def test_session_window(self):
    t = self.t_env.from_elements([(1000, 1, "Hello")], ["a", "b", "c"])
    result = t.window(Session.with_gap(expr.lit(1).seconds).on(expr.col("a")).alias("w")) \
        .group_by(expr.col("w"), expr.col("c")) \
        .select(t.b.sum)
    # assert against the underlying query operation to verify the window definition
    query_operation = result._j_table.getQueryOperation().getChildren().get(0)
    self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
    self.assertEqual('SessionWindow(field: [a], gap: [1000])',
                     query_operation.getGroupWindow().asSummaryString())
def test_session_group_window_over_time(self):
    # create the CSV source file
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_session_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    self.t_env.register_function("my_count", CountAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd'],
        [DataTypes.TINYINT(),
         DataTypes.TIMESTAMP(3),
         DataTypes.TIMESTAMP(3),
         DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)

    # session window with a 30-minute gap, grouped by key columns and window
    t.window(Session.with_gap("30.minutes").on("rowtime").alias("w")) \
        .group_by("a, b, w") \
        .select("a, w.start, w.end, my_count(c) as c") \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[3, 2018-03-11 03:10:00.0, 2018-03-11 03:40:00.0, 1]",
                        "+I[2, 2018-03-11 03:10:00.0, 2018-03-11 04:00:00.0, 2]",
                        "+I[1, 2018-03-11 03:10:00.0, 2018-03-11 04:10:00.0, 2]",
                        "+I[1, 2018-03-11 04:20:00.0, 2018-03-11 04:50:00.0, 1]"])
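The group-window tests above and below register a user-defined aggregate named my_count, but its class is never shown. A minimal sketch of what CountAggregateFunction could look like, assuming PyFlink's general-purpose AggregateFunction base class (the accumulator layout is illustrative):

from pyflink.table import AggregateFunction, DataTypes


class CountAggregateFunction(AggregateFunction):
    """Sketch of a COUNT-style UDAF: counts the rows it sees."""

    def create_accumulator(self):
        # the accumulator is a single mutable counter
        return [0]

    def accumulate(self, accumulator, *args):
        accumulator[0] += 1

    def retract(self, accumulator, *args):
        accumulator[0] -= 1

    def merge(self, accumulator, accumulators):
        for other_acc in accumulators:
            accumulator[0] += other_acc[0]

    def get_value(self, accumulator):
        return accumulator[0]

    def get_result_type(self):
        return DataTypes.BIGINT()

    def get_accumulator_type(self):
        return DataTypes.ARRAY(DataTypes.BIGINT())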
def test_session_window(self):
    t = self.t_env.from_elements(
        [(1000, 1, "Hello"), (2000, 2, "Hello"), (4000, 4, "Hello"), (5000, 8, "Hello")],
        ["a", "b", "c"])
    result = t.window(Session.with_gap("1.seconds").on("a").alias("w")) \
        .group_by("w, c") \
        .select("b.sum")
    actual = self.collect(result)
    expected = ['3', '12']
    self.assert_equals(actual, expected)
def test_session_group_window_over_time(self):
    # create the CSV source file
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_session_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    self.t_env.register_function("my_count", CountAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils
    sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d BIGINT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)

    # same 30-minute session window as above, expressed with the expression API
    t.window(Session.with_gap(lit(30).minutes).on(t.rowtime).alias("w")) \
        .group_by(t.a, t.b, col("w")) \
        .select(t.a, col("w").start, col("w").end, call("my_count", t.c).alias("c")) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "+I[3, 2018-03-11T03:10, 2018-03-11T03:40, 1]",
        "+I[2, 2018-03-11T03:10, 2018-03-11T04:00, 2]",
        "+I[1, 2018-03-11T03:10, 2018-03-11T04:10, 2]",
        "+I[1, 2018-03-11T04:20, 2018-03-11T04:50, 1]"
    ])
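For reference, the same session semantics can also be expressed in SQL against the source_table DDL above; a minimal sketch using Flink SQL's SESSION group-window functions (the built-in COUNT stands in for the my_count UDAF here):

result = self.t_env.sql_query("""
    SELECT
        a,
        SESSION_START(rowtime, INTERVAL '30' MINUTE) AS w_start,
        SESSION_END(rowtime, INTERVAL '30' MINUTE) AS w_end,
        COUNT(c) AS c
    FROM source_table
    GROUP BY a, b, SESSION(rowtime, INTERVAL '30' MINUTE)
""")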
def test_session_window(self):
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.INT(), DataTypes.STRING()]
    data = [(1000, 1, "Hello"), (2000, 2, "Hello"), (4000, 4, "Hello"), (5000, 8, "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env = self.t_env
    t_env.register_table_source("Source", csv_source)
    source = t_env.scan("Source")
    result = source.window(Session.with_gap("1.seconds").on("a").alias("w")) \
        .group_by("w, c") \
        .select("b.sum")
    actual = self.collect(result)
    expected = ['3', '12']
    self.assert_equals(actual, expected)
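The prepare_csv_source helper comes from the PyFlink test base class and is not shown. A minimal sketch of what it plausibly does, inferred from the call site above (the argument order matches the call; the body is a hypothetical stand-in):

import csv

from pyflink.table.sources import CsvTableSource


def prepare_csv_source(path, data, data_types, fields):
    # write the rows to a CSV file and expose it as a CsvTableSource
    # with the given field names and types
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows(data)
    return CsvTableSource(path, fields, data_types)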
from pyflink.common import Types
from pyflink.common.time import Instant
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import DataTypes, Schema, StreamTableEnvironment, TableDescriptor
from pyflink.table.expressions import col, lit
from pyflink.table.window import Session


def session_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(8000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    # define the session window operation: rows with the same name that arrive
    # within 5 seconds of each other fall into the same session
    table = table.window(Session.with_gap(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, table.price.sum, col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
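To run the demo as a standalone script, a standard entry point can be appended (the logging setup is optional):

import logging
import sys

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
    session_window_demo()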
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, CsvTableSink, CsvTableSource, DataTypes
from pyflink.table.window import Session


def session_time_window_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)

    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_session_time_window_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)

    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(),
                        DataTypes.INT(),
                        DataTypes.INT(),
                        DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a"], [DataTypes.INT()], result_file))

    orders = bt_env.scan("Orders")
    # sessionize on the rowtime column with a 10-minute gap and sum b per session
    result = orders.window(Session.with_gap("10.minutes").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("b.sum")
    result.insert_into("result")
    bt_env.execute("session time window batch")
import os

from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
from pyflink.table import CsvTableSink, DataTypes, StreamTableEnvironment
from pyflink.table.descriptors import Json, Kafka, Rowtime, Schema
from pyflink.table.window import Session


def session_time_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)

    result_file = "/tmp/session_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)

    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b"],
                     [DataTypes.STRING(), DataTypes.STRING()],
                     result_file))

    # sessionize the Kafka stream on event time with a 10-minute gap
    st_env.scan("source").window(Session.with_gap("10.minutes").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)") \
        .insert_into("result")

    st_env.execute("session time window streaming")