Example #1
    def test_read_csv(self):
        from storey import ReadCSV, ReduceToDataFrame, build_flow

        csv_path = str(self.results_path / _generate_random_name() / ".csv")
        targets = [CSVTarget("mycsv", path=csv_path)]
        stocks_set = fs.FeatureSet(
            "tests", entities=[Entity("ticker", ValueType.STRING)])
        fs.ingest(stocks_set,
                  stocks,
                  infer_options=fs.InferOptions.default(),
                  targets=targets)

        # reading csv file
        controller = build_flow([ReadCSV(csv_path), ReduceToDataFrame()]).run()
        termination_result = controller.await_termination()

        expected = pd.DataFrame({
            0: ["ticker", "MSFT", "GOOG", "AAPL"],
            1: ["name", "Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
            2: ["exchange", "NASDAQ", "NASDAQ", "NASDAQ"],
        })

        assert termination_result.equals(
            expected), f"{termination_result}\n!=\n{expected}"
        os.remove(csv_path)
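
A minimal sketch of just the read-back step in isolation. It assumes the surrounding test module already imports os, pandas as pd, mlrun.feature_store as fs, and mlrun's CSVTarget/Entity/ValueType helpers; the file name and rows below are illustrative, not taken from the original test:

# Sketch: read a CSV with storey and reduce it to a DataFrame.
from storey import ReadCSV, ReduceToDataFrame, build_flow

with open("stocks.csv", "w") as f:
    f.write("ticker,name,exchange\n")
    f.write("MSFT,Microsoft Corporation,NASDAQ\n")

# With default settings ReadCSV emits the header line as a regular row, which is
# why the `expected` frame above uses positional column labels 0, 1, 2.
controller = build_flow([ReadCSV("stocks.csv"), ReduceToDataFrame()]).run()
df = controller.await_termination()
print(df)
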
Example #2

self._flow = build_flow([
     SyncEmitSource(),
     ProcessEndpointEvent(self.kv_container, self.kv_path),
     FilterNotNone(),
     FlatMap(lambda x: x),
     MapFeatureNames(self.kv_container, self.kv_path),
     # Branch 1: Aggregate events (prediction counts, latency averages) and update TSDB and KV
     [
         AggregateByKey(
             aggregates=[
                 FieldAggregator(
                     PREDICTIONS,
                     ENDPOINT_ID,
                     ["count"],
                     SlidingWindows(
                         self.aggregate_count_windows,
                         self.aggregate_count_period,
                     ),
                 ),
                 FieldAggregator(
                     LATENCY,
                     LATENCY,
                     ["avg"],
                     SlidingWindows(
                         self.aggregate_avg_windows,
                         self.aggregate_avg_period,
                     ),
                 ),
             ],
             table=Table("notable", NoopDriver()),
         ),
         SampleWindow(
             self.sample_window
         ),  # Add the required gap between events to apply sampling
         Map(self.compute_predictions_per_second),
         # Branch 1.1: Update KV
         [
             Map(self.process_before_kv),
             WriteToKV(container=self.kv_container, table=self.kv_path),
             InferSchema(
                 v3io_access_key=self.v3io_access_key,
                 v3io_framesd=self.v3io_framesd,
                 container=self.kv_container,
                 table=self.kv_path,
             ),
         ],
         # Branch 1.2: Update TSDB
         [
             # Map the event into taggable fields, add record type to each field
             Map(self.process_before_events_tsdb),
             [
                 FilterKeys(BASE_METRICS),
                 UnpackValues(BASE_METRICS),
                 TSDBTarget(
                     path=self.tsdb_path,
                     rate="10/m",
                     time_col=TIMESTAMP,
                     container=self.tsdb_container,
                     access_key=self.v3io_access_key,
                     v3io_frames=self.v3io_framesd,
                     index_cols=[ENDPOINT_ID, RECORD_TYPE],
                     # Settings for _Batching
                     max_events=self.tsdb_batching_max_events,
                     timeout_secs=self.tsdb_batching_timeout_secs,
                     key=ENDPOINT_ID,
                 ),
             ],
             [
                 FilterKeys(ENDPOINT_FEATURES),
                 UnpackValues(ENDPOINT_FEATURES),
                 TSDBTarget(
                     path=self.tsdb_path,
                     rate="10/m",
                     time_col=TIMESTAMP,
                     container=self.tsdb_container,
                     access_key=self.v3io_access_key,
                     v3io_frames=self.v3io_framesd,
                     index_cols=[ENDPOINT_ID, RECORD_TYPE],
                     # Settings for _Batching
                     max_events=self.tsdb_batching_max_events,
                     timeout_secs=self.tsdb_batching_timeout_secs,
                     key=ENDPOINT_ID,
                 ),
             ],
             [
                 FilterKeys(CUSTOM_METRICS),
                 FilterNotNone(),
                 UnpackValues(CUSTOM_METRICS),
                 TSDBTarget(
                     path=self.tsdb_path,
                     rate="10/m",
                     time_col=TIMESTAMP,
                     container=self.tsdb_container,
                     access_key=self.v3io_access_key,
                     v3io_frames=self.v3io_framesd,
                     index_cols=[ENDPOINT_ID, RECORD_TYPE],
                     # Settings for _Batching
                     max_events=self.tsdb_batching_max_events,
                     timeout_secs=self.tsdb_batching_timeout_secs,
                     key=ENDPOINT_ID,
                 ),
             ],
         ],
     ],
     # Branch 2: Batch events, write to parquet
     [
         Map(self.process_before_parquet),
         ParquetTarget(
             path=self.parquet_path,
             partition_cols=[
                 "$key", "$year", "$month", "$day", "$hour"
             ],
             infer_columns_from_data=True,
             # Settings for _Batching
             max_events=self.parquet_batching_max_events,
             timeout_secs=self.parquet_batching_timeout_secs,
         ),
     ],
 ]).run()
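
The nested lists passed to build_flow are how storey expresses branching: each inner list forks off the step that precedes it, so one source feeds the KV, TSDB, and Parquet chains in parallel, as the "Branch" comments above indicate. A minimal sketch of the same pattern, with illustrative lambdas and values in place of the monitoring steps:

from storey import SyncEmitSource, Map, Reduce, build_flow

# One source, two branches: each nested list is built as its own chain hanging
# off the preceding step (here, the source).
controller = build_flow([
    SyncEmitSource(),
    [Map(lambda x: x * 2), Reduce(0, lambda acc, x: acc + x)],  # branch 1: sum of doubled values
    [Map(lambda x: x + 1), Reduce(0, lambda acc, x: acc + x)],  # branch 2: sum of incremented values
]).run()

for i in range(4):
    controller.emit(i)
controller.terminate()
controller.await_termination()
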