def group_by_window_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_window_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(),
                        DataTypes.INT(),
                        DataTypes.INT(),
                        DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "start", "end", "rowtime", "d"],
                     [DataTypes.STRING(),
                      DataTypes.TIMESTAMP(),
                      DataTypes.TIMESTAMP(),
                      DataTypes.TIMESTAMP(),
                      DataTypes.INT()],
                     result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("a, w") \
        .select("a, w.start, w.end, w.rowtime, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg batch")
def transactions_job():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment \
        .create(s_env, environment_settings=EnvironmentSettings
                .new_instance()
                .in_streaming_mode()
                .use_blink_planner().build())

    register_transactions_source(st_env)
    register_transactions_es_sink(st_env)

    st_env.from_path("source") \
        .window(Tumble.over("10.hours").on("rowtime").alias("w")) \
        .group_by("customer, w") \
        .select("""customer as customer,
                   count(transaction_type) as count_transactions,
                   sum(online_payment_amount) as total_online_payment_amount,
                   sum(in_store_payment_amount) as total_in_store_payment_amount,
                   last(lat) as lat,
                   last(lon) as lon,
                   w.end as last_transaction_time
                """) \
        .filter("total_online_payment_amount<total_in_store_payment_amount") \
        .filter("count_transactions>=3") \
        .filter("lon < 20.62") \
        .filter("lon > 20.20") \
        .filter("lat < 44.91") \
        .filter("lat > 44.57") \
        .insert_into("sink_into_csv")

    st_env.execute("app")
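# transactions_job() above calls register_transactions_source and
# register_transactions_es_sink without showing them. A minimal sketch of the
# source helper, assuming a JSON Kafka topic: the field names are taken from
# the query (customer, transaction_type, online_payment_amount,
# in_store_payment_amount, lat, lon, rowtime), while the connector options and
# watermark delay are illustrative assumptions. last(...) in the query likewise
# presumes a registered aggregate that picks the latest value per window.
def register_transactions_source(st_env):
    st_env.execute_sql("""
        CREATE TABLE source (
            customer STRING,
            transaction_type STRING,
            online_payment_amount DOUBLE,
            in_store_payment_amount DOUBLE,
            lat DOUBLE,
            lon DOUBLE,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '5' SECOND
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'transactions',
            'properties.bootstrap.servers' = 'localhost:9092',
            'format' = 'json'
        )
    """)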
def test_tumbling_group_window_over_time(self):
    # create source file path
    import tempfile
    import os
    tmp_dir = tempfile.gettempdir()
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:30:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
    ]
    source_path = tmp_dir + '/test_tumbling_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    self.t_env.create_temporary_system_function(
        "my_count", CountDistinctAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c INT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd'],
        [DataTypes.TINYINT(),
         DataTypes.TIMESTAMP(3),
         DataTypes.TIMESTAMP(3),
         DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)

    t.window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("a, w") \
        .select("a, w.start, w.end, my_count(c) as c") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 1]",
        "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 1]",
        "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2]",
        "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 1]"
    ])
    os.remove(source_path)
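# CountDistinctAggregateFunction is used by several tests here but never shown.
# One possible implementation as a general (non-pandas) aggregate function,
# assuming the pyflink.table.udf.AggregateFunction API (Flink 1.12+); the INT
# key type mirrors column c in the tests, and the map-plus-counter accumulator
# layout is an illustrative choice:
from pyflink.common import Row
from pyflink.table import DataTypes
from pyflink.table.udf import AggregateFunction

class CountDistinctAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # Row(seen-values map, current distinct count)
        return Row({}, 0)

    def accumulate(self, accumulator, value):
        if value is not None and value not in accumulator[0]:
            accumulator[0][value] = True
            accumulator[1] += 1

    def get_value(self, accumulator):
        return accumulator[1]

    def get_accumulator_type(self):
        return DataTypes.ROW(
            [DataTypes.FIELD("seen", DataTypes.MAP(DataTypes.INT(), DataTypes.BOOLEAN())),
             DataTypes.FIELD("count", DataTypes.BIGINT())])

    def get_result_type(self):
        return DataTypes.BIGINT()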
def max_travellers_per_destination():
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR dropOffTime AS dropOffTime - INTERVAL '30' SECONDS"))
    taxi_ride = t_env.from_path('TaxiRide')
    no_of_travelers_per_dest = taxi_ride \
        .select(taxi_ride.passengerCount, taxi_ride.dropOffTime, taxi_ride.destLocationZone) \
        .window(Tumble.over('1.hour').on(taxi_ride.dropOffTime).alias('w')) \
        .group_by(taxi_ride.destLocationZone, col('w')) \
        .select(taxi_ride.destLocationZone,
                col('w').start.alias('start'),
                col('w').end.alias('end'),
                taxi_ride.passengerCount.count.alias('cnt'))

    t_env.to_append_stream(
        no_of_travelers_per_dest,
        Types.ROW_NAMED(['destLocationZone', 'start', 'end', 'cnt'],
                        [Types.STRING(),
                         Types.SQL_TIMESTAMP(),
                         Types.SQL_TIMESTAMP(),
                         Types.LONG()])).print()

    env.execute('Max-Travellers-Per-Destination')
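# create_table_ddl is not included in this collection; a plausible sketch that
# returns a TaxiRide DDL with the caller-supplied watermark clause spliced in.
# The field names come from the query above (passengerCount, dropOffTime,
# destLocationZone); the Kafka connector options are purely assumptions.
def create_table_ddl(watermark_clause):
    return """
        CREATE TABLE TaxiRide (
            passengerCount INT,
            destLocationZone STRING,
            dropOffTime TIMESTAMP(3),
            %s
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'taxi-rides',
            'properties.bootstrap.servers' = 'localhost:9092',
            'format' = 'json'
        )
    """ % watermark_clause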
def test_tumbling_group_window_over_time(self):
    # create source file path
    import tempfile
    import os
    tmp_dir = tempfile.gettempdir()
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_tumbling_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    from pyflink.table.window import Tumble
    self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd'],
        [DataTypes.TINYINT(),
         DataTypes.TIMESTAMP(3),
         DataTypes.TIMESTAMP(3),
         DataTypes.FLOAT()])
    self.t_env.register_table_sink("Results", table_sink)

    t.window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("a, b, w") \
        .select("a, w.start, w.end, mean_udaf(c) as b") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["1,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.5",
                        "1,2018-03-11 04:00:00.0,2018-03-11 05:00:00.0,8.0",
                        "2,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.0",
                        "3,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.0"])
    os.remove(source_path)
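# mean_udaf is referenced by this and several later tests without a definition;
# consistent with the inline definition in pandas_udaf() further down, it is
# presumably a pandas batch aggregate over the grouped series:
from pyflink.table import DataTypes
from pyflink.table.udf import udaf

@udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
def mean_udaf(v):
    return v.mean()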
def test_tumble_group_window_aggregate_function(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

    sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b TIMESTAMP(3), c FLOAT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)

    tumble_window = Tumble.over(lit(1).hours) \
        .on(col("rowtime")) \
        .alias("w")
    t.window(tumble_window) \
        .group_by(col("w")) \
        .select(col("w").start, col("w").end, mean_udaf(t.b)) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[2018-03-11T03:00, 2018-03-11T04:00, 2.2]",
                        "+I[2018-03-11T04:00, 2018-03-11T05:00, 8.0]"])
def test_tumbling_group_window_over_time(self):
    # create source file path
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:30:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
    ]
    source_path = tmp_dir + '/test_tumbling_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    self.t_env.create_temporary_system_function(
        "my_count", CountDistinctAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c INT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils
    sink_table_ddl = """
        CREATE TABLE Results(
            a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d BIGINT, e BIGINT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)

    t.window(Tumble.over(lit(1).hours).on(t.rowtime).alias("w")) \
        .group_by(t.a, col("w")) \
        .select(t.a,
                col("w").start,
                col("w").end,
                t.c.count.alias("c"),
                call("my_count", t.c).alias("d")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "+I[2, 2018-03-11T03:00, 2018-03-11T04:00, 2, 1]",
        "+I[3, 2018-03-11T03:00, 2018-03-11T04:00, 1, 1]",
        "+I[1, 2018-03-11T03:00, 2018-03-11T04:00, 2, 2]",
        "+I[1, 2018-03-11T04:00, 2018-03-11T05:00, 1, 1]"
    ])
def test_tumble_window(self):
    t = self.t_env.from_elements([(1, 1, "Hello")], ["a", "b", "c"])
    result = t.window(Tumble.over(expr.row_interval(2)).on("a").alias("w")) \
        .group_by(expr.col('w'), expr.col('c')).select(t.b.sum)

    query_operation = result._j_table.getQueryOperation().getChildren().get(0)
    self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
    self.assertEqual('TumbleWindow(field: [a], size: [2])',
                     query_operation.getGroupWindow().asSummaryString())
def test_tumbling_group_window_over_count(self):
    self.t_env.get_config().set("parallelism.default", "1")
    # create source file path
    import tempfile
    import os
    tmp_dir = tempfile.gettempdir()
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00',
        '1,1,4,2018-03-11 04:20:00',
    ]
    source_path = tmp_dir + '/test_group_window_aggregate_function_over_count.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    from pyflink.table.window import Tumble
    self.t_env.get_config().set(
        "pipeline.time-characteristic", "ProcessingTime")
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            protime as PROCTIME()
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'd'],
        [DataTypes.TINYINT(), DataTypes.FLOAT()])
    self.t_env.register_table_sink("Results", table_sink)

    t.window(Tumble.over(row_interval(2)).on(t.protime).alias("w")) \
        .group_by(t.a, t.b, col("w")) \
        .select(t.a, mean_udaf(t.c).alias("b")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[1, 2.5]", "+I[1, 6.0]", "+I[2, 2.0]", "+I[3, 2.5]"])
    os.remove(source_path)
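# col, lit, call and row_interval appear throughout these tests without their
# imports being shown; they live in pyflink.table.expressions:
from pyflink.table.expressions import call, col, lit, row_interval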
def perform_tumbling_window_aggregation(input_table_name):
    # use SQL Table in the Table API
    input_table = table_env.from_path(input_table_name)

    tumbling_window_table = (
        input_table.window(
            Tumble.over("1.minute").on("EVENT_TIME").alias("one_minute_window")
        )
        .group_by("TICKER, one_minute_window")
        .select("TICKER, PRICE.avg as PRICE, one_minute_window.end as EVENT_TIME"))

    return tumbling_window_table
def count_by_word(input_table_name):
    # note: despite the name, this performs a per-ticker tumbling window average
    # use SQL Table in the Table API
    input_table = table_env.from_path(input_table_name)

    tumbling_window_table = (
        input_table.window(
            Tumble.over("1.minute").on("event_time").alias("one_minute_window")
        )
        .group_by("ticker, one_minute_window")
        .select("ticker, price.avg as price, one_minute_window.end as event_time"))

    return tumbling_window_table
def test_tumble_window(self):
    t = self.t_env.from_elements(
        [(1, 1, "Hello"), (2, 2, "Hello"), (3, 4, "Hello"), (4, 8, "Hello")],
        ["a", "b", "c"])
    result = t.window(Tumble.over("2.rows").on("a").alias("w")) \
        .group_by("w, c").select("b.sum")
    actual = self.collect(result)
    expected = ['3', '12']
    self.assert_equals(actual, expected)
def perform_tumbling_window_aggregation(input_table_name):
    # use SQL Table in the Table API
    input_table = table_env.from_path(input_table_name)

    tumbling_window_table = (
        input_table.window(
            Tumble.over("10.seconds").on("event_time").alias("ten_second_window")
        )
        .group_by("ticker, ten_second_window")
        .select("ticker, price.sum as price, ten_second_window.end as event_time"))

    return tumbling_window_table
def demo01():
    # environment configuration
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
            .in_batch_mode().use_blink_planner().build())

    # register Orders table and Result table sink in table environment
    source_data_path = "/path/to/source/directory/"
    result_data_path = "/path/to/result/directory/"
    source_ddl = f"""
        create table Orders(
            a VARCHAR,
            b BIGINT,
            c BIGINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '{source_data_path}'
        )
    """
    t_env.execute_sql(source_ddl)

    sink_ddl = f"""
        create table `Result`(
            a VARCHAR,
            cnt BIGINT
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '{result_data_path}'
        )
    """
    t_env.execute_sql(sink_ddl)

    # specify table program
    orders = t_env.from_path("Orders")  # schema (a, b, c, rowtime)
    orders.group_by("a").select(orders.a, orders.b.count.alias('cnt')).execute_insert("Result").wait()

    # further transformations (shown for illustration; their results are not used)
    orders.where(orders.a == 'red')
    orders.filter(orders.b % 2 == 0)
    orders.add_columns(concat(orders.c, 'sunny'))
    orders.add_or_replace_columns(concat(orders.c, 'sunny').alias('desc'))
    orders.drop_columns(orders.b, orders.c)
    orders.rename_columns(orders.b.alias('b2'), orders.c.alias('c2'))
    orders.group_by(orders.a).select(orders.a, orders.b.sum.alias('d'))
    # tab.group_by(tab.key).select(tab.key, tab.value.avg.alias('average'))
    # tab.group_by("key").select("key, value.avg as average")

    result = orders.filter(orders.a.is_not_null & orders.b.is_not_null & orders.c.is_not_null) \
        .select(orders.a.lower_case.alias('a'), orders.b, orders.rowtime) \
        .window(Tumble.over(lit(1).hour).on(orders.rowtime).alias("hourly_window")) \
        .group_by(col('hourly_window'), col('a')) \
        .select(col('a'), col('hourly_window').end.alias('hour'), col('b').avg.alias('avg_billing_amount'))
def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
def perform_tumbling_window_aggregation(input_table_name):
    # use SQL Table in the Table API
    input_table = table_env.from_path(input_table_name)

    tumbling_window_table = (
        input_table.window(
            Tumble.over("10.seconds").on("EVENT_TIME").alias("ten_second_window")
        )
        .group_by("TICKER, ten_second_window")
        .select("TICKER, PRICE.sum as PRICE, ten_second_window.end as EVENT_TIME")
    )

    return tumbling_window_table
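# perform_tumbling_window_aggregation() and count_by_word() above reference a
# module-level table_env created outside the snippets; a minimal sketch of that
# setup (streaming mode is an assumption):
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
table_env = StreamTableEnvironment.create(stream_execution_environment=env)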
def test_tumbling_group_window_over_count(self):
    self.t_env.get_config().get_configuration().set_string("parallelism.default", "1")
    # create source file path
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_tumbling_group_window_over_count.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    self.t_env.register_function("my_sum", SumAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            protime as PROCTIME()
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'd'],
        [DataTypes.TINYINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)

    t.window(Tumble.over("2.rows").on("protime").alias("w")) \
        .group_by("a, w") \
        .select("a, my_sum(c) as b") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[1, 5]", "+I[2, 4]", "+I[3, 5]"])
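# SumAggregateFunction is registered above but not defined anywhere in this
# collection; a minimal sketch as a general aggregate function, with the BIGINT
# result matching the sink column and a single-cell list as accumulator (an
# illustrative choice):
from pyflink.table import DataTypes
from pyflink.table.udf import AggregateFunction

class SumAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        return [0]

    def accumulate(self, accumulator, value):
        if value is not None:
            accumulator[0] += value

    def get_value(self, accumulator):
        return accumulator[0]

    def get_accumulator_type(self):
        return DataTypes.ARRAY(DataTypes.BIGINT())

    def get_result_type(self):
        return DataTypes.BIGINT()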
def test_tumble_window(self):
    source_path = os.path.join(self.tempdir + '/streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.INT(), DataTypes.STRING()]
    data = [(1, 1, "Hello"), (2, 2, "Hello"), (3, 4, "Hello"), (4, 8, "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env = self.t_env
    t_env.register_table_source("Source", csv_source)
    source = t_env.scan("Source")

    result = source.window(Tumble.over("2.rows").on("a").alias("w")) \
        .group_by("w, c").select("b.sum")
    actual = self.collect(result)

    expected = ['3', '12']
    self.assert_equals(actual, expected)
def test_tumbling_group_window_over_count(self):
    self.t_env.get_config().set("parallelism.default", "1")
    # create source file path
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_tumbling_group_window_over_count.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    self.t_env.register_function("my_sum", SumAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            protime as PROCTIME()
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils
    sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, d BIGINT) WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)

    t.window(Tumble.over(row_interval(2)).on(t.protime).alias("w")) \
        .group_by(t.a, col("w")) \
        .select(t.a, call("my_sum", t.c).alias("b")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[1, 5]", "+I[2, 4]", "+I[3, 5]"])
def test_window_aggregate_with_pandas_udaf(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c'],
        [DataTypes.TIMESTAMP(3),
         DataTypes.FLOAT(),
         DataTypes.INT()])
    self.t_env.register_table_sink("Results", table_sink)

    pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                       result_type=DataTypes.ROW(
                           [DataTypes.FIELD("a", DataTypes.FLOAT()),
                            DataTypes.FIELD("b", DataTypes.INT())]),
                       func_type="pandas")
    tumble_window = Tumble.over(expr.lit(1).hours) \
        .on(expr.col("rowtime")) \
        .alias("w")
    t.select(t.b, t.rowtime) \
        .window(tumble_window) \
        .group_by("w") \
        .aggregate(pandas_udaf.alias("d", "e")) \
        .select("w.rowtime, d, e") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["2018-03-11 03:59:59.999,2.2,3",
                        "2018-03-11 04:59:59.999,8.0,8"])
def test_window_aggregate_with_pandas_udaf(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

    sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b FLOAT, c INT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)

    pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                       result_type=DataTypes.ROW(
                           [DataTypes.FIELD("a", DataTypes.FLOAT()),
                            DataTypes.FIELD("b", DataTypes.INT())]),
                       func_type="pandas")
    tumble_window = Tumble.over(expr.lit(1).hours) \
        .on(expr.col("rowtime")) \
        .alias("w")
    t.select(t.b, t.rowtime) \
        .window(tumble_window) \
        .group_by(expr.col("w")) \
        .aggregate(pandas_udaf.alias("d", "e")) \
        .select(expr.col("w").rowtime, expr.col("d"), expr.col("e")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[2018-03-11 03:59:59.999, 2.2, 3]",
                        "+I[2018-03-11 04:59:59.999, 8.0, 8]"])
def test_tumble_group_window_aggregate_function(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c'],
        [DataTypes.TIMESTAMP(3),
         DataTypes.TIMESTAMP(3),
         DataTypes.FLOAT()])
    self.t_env.register_table_sink("Results", table_sink)
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)

    tumble_window = Tumble.over(lit(1).hours) \
        .on(col("rowtime")) \
        .alias("w")
    t.window(tumble_window) \
        .group_by(col("w")) \
        .select(col("w").start, col("w").end, mean_udaf(t.b)) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.2]",
                        "+I[2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0]"])
def tumble_time_window_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_tumble_time_window_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(),
                        DataTypes.INT(),
                        DataTypes.INT(),
                        DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \
        .group_by("w, a").select("b.sum")
    result.insert_into("result")
    bt_env.execute("tumble time window batch")
def test_tumbling_group_window_over_time(self):
    # create source file path
    import tempfile
    import os
    tmp_dir = tempfile.gettempdir()
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_tumbling_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    from pyflink.table.window import Tumble
    self.t_env.get_config().set(
        "pipeline.time-characteristic", "EventTime")
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    sink_table_ddl = """
        CREATE TABLE Results(
            a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d TIMESTAMP(3), e FLOAT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)

    t.window(Tumble.over(lit(1).hours).on(t.rowtime).alias("w")) \
        .group_by(t.a, t.b, col("w")) \
        .select(t.a,
                col("w").start,
                col("w").end,
                col("w").rowtime,
                mean_udaf(t.c).alias("b")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "+I[1, 2018-03-11T03:00, 2018-03-11T04:00, 2018-03-11T03:59:59.999, 2.5]",
        "+I[1, 2018-03-11T04:00, 2018-03-11T05:00, 2018-03-11T04:59:59.999, 8.0]",
        "+I[2, 2018-03-11T03:00, 2018-03-11T04:00, 2018-03-11T03:59:59.999, 2.0]",
        "+I[3, 2018-03-11T03:00, 2018-03-11T04:00, 2018-03-11T03:59:59.999, 2.0]",
    ])
    os.remove(source_path)
st_env \
    .connect(custom_connector) \
    .with_format(
        custom_format
    ) \
    .with_schema(
        # declare the schema of the table
        Schema()
        .field("proctime", DataTypes.TIMESTAMP())
        .proctime()
        .field("a", DataTypes.STRING())
        .field("b", DataTypes.STRING())
        .field("c", DataTypes.STRING())
    ) \
    .in_append_mode() \
    .register_table_source("source")

st_env.register_table_sink("result",
                           CsvTableSink(["a", "b"],
                                        [DataTypes.STRING(),
                                         DataTypes.STRING()],
                                        result_file))

st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \
    .group_by("w, a") \
    .select("a, max(b)").insert_into("result")

st_env.execute("custom kafka source demo")

# cat /tmp/custom_kafka_source_demo.csv
# a,3
# b,4
# a,5
def custom_kafka_source_demo():
    custom_connector = CustomConnectorDescriptor('kafka', 1, True) \
        .property('connector.topic', 'user') \
        .property('connector.properties.0.key', 'zookeeper.connect') \
        .property('connector.properties.0.value', 'localhost:2181') \
        .property('connector.properties.1.key', 'bootstrap.servers') \
        .property('connector.properties.1.value', 'localhost:9092') \
        .properties({'connector.version': '0.11', 'connector.startup-mode': 'earliest-offset'})

    # the key is 'format.json-schema'
    custom_format = CustomFormatDescriptor('json', 1) \
        .property('format.json-schema',
                  "{"
                  " type: 'object',"
                  " properties: {"
                  " a: {"
                  " type: 'string'"
                  " },"
                  " b: {"
                  " type: 'string'"
                  " },"
                  " c: {"
                  " type: 'string'"
                  " },"
                  " time: {"
                  " type: 'string',"
                  " format: 'date-time'"
                  " }"
                  " }"
                  "}") \
        .properties({'format.fail-on-missing-field': 'true'})

    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/custom_kafka_source_demo.csv"
    if os.path.exists(result_file):
        os.remove(result_file)

    st_env \
        .connect(custom_connector) \
        .with_format(
            custom_format
        ) \
        .with_schema(
            # declare the schema of the table
            Schema()
            .field("proctime", DataTypes.TIMESTAMP())
            .proctime()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.STRING()],
            result_file))

    st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")
    st_env.execute("custom kafka source demo")
def pv_uv_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
            .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)

    st_env \
        .connect(
            # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user_behavior")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(
            # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                " type: 'object',"
                " properties: {"
                " user_id: {"
                " type: 'string'"
                " },"
                " item_id: {"
                " type: 'string'"
                " },"
                " category_id: {"
                " type: 'string'"
                " },"
                " behavior: {"
                " type: 'string'"
                " },"
                " ts: {"
                " type: 'string',"
                " format: 'date-time'"
                " }"
                " }"
                "}"
            )
        ) \
        .with_schema(
            # declare the schema of the table
            Schema()
            .field("user_id", DataTypes.STRING())
            .field("item_id", DataTypes.STRING())
            .field("category_id", DataTypes.STRING())
            .field("behavior", DataTypes.STRING())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("ts")
                .watermarks_periodic_bounded(60000))
        ) \
        .in_append_mode() \
        .register_table_source("source")

    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \
        .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \
        .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \
        .property("connector.table", "pv_uv_table") \
        .property("connector.write.flush.max-rows", "1")
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("startTime", DataTypes.TIMESTAMP())
            .field("endTime", DataTypes.TIMESTAMP())
            .field("pv", DataTypes.BIGINT())
            .field("uv", DataTypes.BIGINT())
        ).register_table_sink("sink")

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("w.start as startTime, w.end as endTime, COUNT(1) as pv, user_id.count.distinct as uv") \
        .insert_into("sink")
    st_env.execute("table pv uv")
" page: {" " type: 'string'" " }," " count: {" " type: 'number'" " }" " }" "}" ) ) \ .with_schema( # declare the schema of the table Schema() .field("windowStart", DataTypes.TIMESTAMP()).proctime() .field("windowEnd", DataTypes.TIMESTAMP()).proctime() .field("page", DataTypes.STRING()) .field('count', DataTypes.BIGINT()) ) \ .in_append_mode() \ .register_table_sink("ClickEventStatistics Sink") # Tumble window aggregation # 15 sec window st_env.scan("ClickEvent Source") \ .window(Tumble.over("15.seconds").on("timestamp").alias("w")) \ .group_by("w, page") \ .select("w.start as windowStart,w.end as windowEnd,page,count(1) as count") \ .insert_into("ClickEventStatistics Sink") # Now run it st_env.execute("Click Event Count")
def distinct_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
            .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)

    st_env \
        .connect(
            # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(
            # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                " type: 'object',"
                " properties: {"
                " a: {"
                " type: 'string'"
                " },"
                " b: {"
                " type: 'string'"
                " },"
                " c: {"
                " type: 'string'"
                " },"
                " time: {"
                " type: 'string',"
                " format: 'date-time'"
                " }"
                " }"
                "}"
            )
        ) \
        .with_schema(
            # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("Orders")

    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("distinct_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
    ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
            Json()
            .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")

    orders = st_env.scan("Orders")
    result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \
        .group_by("a, w").select("a, b.max.distinct as d")
    result.insert_into("result")
    st_env.execute("distinct agg streaming")
def tumble_time_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/tumble_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)

    st_env \
        .connect(
            # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(
            # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                " type: 'object',"
                " properties: {"
                " a: {"
                " type: 'string'"
                " },"
                " b: {"
                " type: 'string'"
                " },"
                " c: {"
                " type: 'string'"
                " },"
                " time: {"
                " type: 'string',"
                " format: 'date-time'"
                " }"
                " }"
                "}"
            )
        ) \
        .with_schema(
            # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file))

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")
    st_env.execute("tumble time window streaming")