def test_sliding_group_window_over_count(self):
    self.t_env.get_config().set("parallelism.default", "1")
    # create source file path
    import tempfile
    import os
    tmp_dir = tempfile.gettempdir()
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_sliding_group_window_over_count.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    from pyflink.table.window import Slide
    self.t_env.get_config().set(
        "pipeline.time-characteristic", "ProcessingTime")
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            protime as PROCTIME()
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'd'],
        [DataTypes.TINYINT(), DataTypes.FLOAT()])
    self.t_env.register_table_sink("Results", table_sink)

    # count-based sliding window: 2 rows per window, advancing by 1 row,
    # defined on the processing-time attribute
    t.window(Slide.over(row_interval(2))
             .every(row_interval(1))
             .on(t.protime)
             .alias("w")) \
        .group_by(t.a, t.b, col("w")) \
        .select(t.a, mean_udaf(t.c).alias("b")) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[1, 2.5]", "+I[1, 5.5]", "+I[2, 2.0]", "+I[3, 2.5]"])
    os.remove(source_path)
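# NOTE: mean_udaf is referenced above but not defined in this snippet. A minimal
# sketch, assuming a vectorized (pandas) aggregate whose FLOAT result matches the
# sink's DataTypes.FLOAT() column, could look like this:
from pyflink.table import DataTypes
from pyflink.table.udf import udaf


@udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
def mean_udaf(v):
    # v is the pandas.Series of values in the current window/group
    return v.mean()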
def test_sliding_group_window_over_count(self):
    self.t_env.get_config().get_configuration().set_string("parallelism.default", "1")
    # create source file path
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_sliding_group_window_over_count.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    self.t_env.register_function("my_sum", SumAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            protime as PROCTIME()
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'd'],
        [DataTypes.TINYINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)

    # count-based sliding window: 2 rows per window, advancing by 1 row
    t.window(Slide.over(row_interval(2)).every(row_interval(1)).on(t.protime).alias("w")) \
        .group_by(t.a, col("w")) \
        .select(t.a, call("my_sum", t.c).alias("b")) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[1, 5]", "+I[1, 11]", "+I[2, 4]", "+I[3, 5]"])
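# NOTE: SumAggregateFunction is likewise referenced but not shown in this snippet.
# A minimal sketch, assuming a general (non-vectorized) AggregateFunction that sums
# BIGINT values and keeps the running sum in a one-element list accumulator:
from pyflink.table import AggregateFunction, DataTypes


class SumAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # mutable one-element list holding the running sum
        return [0]

    def get_value(self, accumulator):
        return accumulator[0]

    def accumulate(self, accumulator, *args):
        accumulator[0] += args[0]

    def retract(self, accumulator, *args):
        accumulator[0] -= args[0]

    def get_accumulator_type(self):
        return DataTypes.ARRAY(DataTypes.BIGINT())

    def get_result_type(self):
        return DataTypes.BIGINT()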
def test_tumble_window(self):
    t = self.t_env.from_elements([(1, 1, "Hello")], ["a", "b", "c"])
    result = t.window(Tumble.over(expr.row_interval(2)).on("a").alias("w")) \
        .group_by(expr.col('w'), expr.col('c')) \
        .select(t.b.sum)

    query_operation = result._j_table.getQueryOperation().getChildren().get(0)
    self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
    self.assertEqual('TumbleWindow(field: [a], size: [2])',
                     query_operation.getGroupWindow().asSummaryString())
def test_expressions(self):
    expr1 = col('a')
    expr2 = col('b')
    expr3 = col('c')

    self.assertEqual('10', str(lit(10, DataTypes.INT(False))))
    self.assertEqual('rangeTo(1, 2)', str(range_(1, 2)))
    self.assertEqual('and(a, b, c)', str(and_(expr1, expr2, expr3)))
    self.assertEqual('or(a, b, c)', str(or_(expr1, expr2, expr3)))

    from pyflink.table.expressions import UNBOUNDED_ROW, UNBOUNDED_RANGE, CURRENT_ROW, \
        CURRENT_RANGE
    self.assertEqual('unboundedRow()', str(UNBOUNDED_ROW))
    self.assertEqual('unboundedRange()', str(UNBOUNDED_RANGE))
    self.assertEqual('currentRow()', str(CURRENT_ROW))
    self.assertEqual('currentRange()', str(CURRENT_RANGE))

    self.assertEqual('currentDate()', str(current_date()))
    self.assertEqual('currentTime()', str(current_time()))
    self.assertEqual('currentTimestamp()', str(current_timestamp()))
    self.assertEqual('localTime()', str(local_time()))
    self.assertEqual('localTimestamp()', str(local_timestamp()))
    self.assertEqual('toTimestampLtz(123, 0)', str(to_timestamp_ltz(123, 0)))
    self.assertEqual("temporalOverlaps(cast('2:55:00', TIME(0)), 3600000, "
                     "cast('3:30:00', TIME(0)), 7200000)",
                     str(temporal_overlaps(
                         lit("2:55:00").to_time,
                         lit(1).hours,
                         lit("3:30:00").to_time,
                         lit(2).hours)))
    self.assertEqual("dateFormat(time, '%Y, %d %M')",
                     str(date_format(col("time"), "%Y, %d %M")))
    self.assertEqual("timestampDiff(DAY, cast('2016-06-15', DATE), cast('2016-06-18', DATE))",
                     str(timestamp_diff(
                         TimePointUnit.DAY,
                         lit("2016-06-15").to_date,
                         lit("2016-06-18").to_date)))
    self.assertEqual('array(1, 2, 3)', str(array(1, 2, 3)))
    self.assertEqual("row('key1', 1)", str(row("key1", 1)))
    self.assertEqual("map('key1', 1, 'key2', 2, 'key3', 3)",
                     str(map_("key1", 1, "key2", 2, "key3", 3)))
    self.assertEqual('4', str(row_interval(4)))
    self.assertEqual('pi()', str(pi()))
    self.assertEqual('e()', str(e()))
    self.assertEqual('rand(4)', str(rand(4)))
    self.assertEqual('randInteger(4)', str(rand_integer(4)))
    self.assertEqual('atan2(1, 2)', str(atan2(1, 2)))
    self.assertEqual('minusPrefix(a)', str(negative(expr1)))
    self.assertEqual('concat(a, b, c)', str(concat(expr1, expr2, expr3)))
    self.assertEqual("concat_ws(', ', b, c)", str(concat_ws(', ', expr2, expr3)))
    self.assertEqual('uuid()', str(uuid()))
    self.assertEqual('null', str(null_of(DataTypes.BIGINT())))
    self.assertEqual('log(a)', str(log(expr1)))
    self.assertEqual('ifThenElse(a, b, c)', str(if_then_else(expr1, expr2, expr3)))
    self.assertEqual('withColumns(a, b, c)', str(with_columns(expr1, expr2, expr3)))
    self.assertEqual('a.b.c(a)', str(call('a.b.c', expr1)))
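# NOTE: the assertions above rely on names imported at module level, which are not
# shown in this snippet. One plausible import block (an assumption about the
# surrounding test module, not part of the original) is:
from pyflink.table import DataTypes
from pyflink.table.expression import TimePointUnit  # assumed location of TimePointUnit
from pyflink.table.expressions import (
    col, lit, range_, and_, or_, current_date, current_time, current_timestamp,
    local_time, local_timestamp, to_timestamp_ltz, temporal_overlaps, date_format,
    timestamp_diff, array, row, map_, row_interval, pi, e, rand, rand_integer,
    atan2, negative, concat, concat_ws, uuid, null_of, log, if_then_else,
    with_columns, call)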
def test_over_window(self):
    t_env = self.t_env
    t = t_env.from_elements([(1, 1, "Hello")], ['a', 'b', 'c'])
    result = t.over_window(
        Over.partition_by("c").order_by("a")
            .preceding(expr.row_interval(2)).following(expr.CURRENT_ROW).alias("w"))

    self.assertRaisesRegex(
        Py4JJavaError, "Ordering must be defined on a time attribute",
        result.select, "b.sum over w")
from pyflink.common.time import Instant
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import DataTypes, Schema, StreamTableEnvironment, TableDescriptor
from pyflink.table.expressions import col, row_interval, CURRENT_ROW
from pyflink.table.window import Over


def over_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .build())
                       .build())

    # define the over window operation:
    # per name, the max price over the 2 preceding rows up to the current row
    table = table.over_window(
        Over.partition_by(col("name"))
            .order_by(col("ts"))
            .preceding(row_interval(2))
            .following(CURRENT_ROW)
            .alias('w')) \
        .select(table.name, table.price.max.over(col('w')))

    # submit for execution
    table.execute_insert('sink') \
        .wait()
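# NOTE: to run the demo as a stand-alone script, a standard entry point can be
# appended (an assumption; the original snippet ends with the function body):
if __name__ == '__main__':
    # results are printed to stdout via the 'print' connector sink defined above
    over_window_demo()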