def test_check_result_type(self): def pandas_udaf(): pass with self.assertRaises( TypeError, msg="Invalid returnType: Pandas UDAF doesn't support DataType type MAP currently"): udaf(pandas_udaf, result_type=DataTypes.MAP(DataTypes.INT(), DataTypes.INT()), func_type="pandas")
def test_map_view(self): my_count = udaf(CountDistinctAggregateFunction()) self.t_env.get_config().set_idle_state_retention( datetime.timedelta(days=1)) self.t_env.get_config().set("python.fn-execution.bundle.size", "2") # trigger the cache eviction in a bundle. self.t_env.get_config().set("python.state.cache-size", "1") self.t_env.get_config().set("python.map-state.read-cache-size", "1") self.t_env.get_config().set("python.map-state.write-cache-size", "1") t = self.t_env.from_elements([(1, 'Hi_', 'hi'), (1, 'Hi', 'hi'), (2, 'hello', 'hello'), (3, 'Hi_', 'hi'), (3, 'Hi', 'hi'), (4, 'hello', 'hello'), (5, 'Hi2_', 'hi'), (5, 'Hi2', 'hi'), (6, 'hello2', 'hello'), (7, 'Hi', 'hi'), (8, 'hello', 'hello'), (9, 'Hi2', 'hi'), (13, 'Hi3', 'hi')], ['a', 'b', 'c']) self.t_env.create_temporary_view("source", t) table_with_retract_message = self.t_env.sql_query( "select LAST_VALUE(b) as b, LAST_VALUE(c) as c from source group by a" ) result = table_with_retract_message.group_by(t.c).select( my_count(t.b).alias("a"), t.c) assert_frame_equal( result.to_pandas().sort_values('c').reset_index(drop=True), pd.DataFrame([[2, "hello"], [3, "hi"]], columns=['a', 'c']))
def test_map_view_iterate(self): test_iterate = udaf(TestIterateAggregateFunction()) self.t_env.get_config().set_idle_state_retention( datetime.timedelta(days=1)) self.t_env.get_config().set("python.fn-execution.bundle.size", "2") # trigger the cache eviction in a bundle. self.t_env.get_config().set("python.state.cache-size", "2") self.t_env.get_config().set("python.map-state.read-cache-size", "2") self.t_env.get_config().set("python.map-state.write-cache-size", "2") self.t_env.get_config().set( "python.map-state.iterate-response-batch-size", "2") t = self.t_env.from_elements([(1, 'Hi_', 'hi'), (1, 'Hi', 'hi'), (2, 'hello', 'hello'), (3, 'Hi_', 'hi'), (3, 'Hi', 'hi'), (4, 'hello', 'hello'), (5, 'Hi2_', 'hi'), (5, 'Hi2', 'hi'), (6, 'hello2', 'hello'), (7, 'Hi', 'hi'), (8, 'hello', 'hello'), (9, 'Hi2', 'hi'), (13, 'Hi3', 'hi')], ['a', 'b', 'c']) self.t_env.create_temporary_view("source", t) table_with_retract_message = self.t_env.sql_query( "select LAST_VALUE(b) as b, LAST_VALUE(c) as c from source group by a" ) result = table_with_retract_message.group_by(t.c) \ .select(test_iterate(t.b).alias("a"), t.c) \ .select(col("a").get(0).alias("a"), col("a").get(1).alias("b"), col("a").get(2).alias("c"), col("a").get(3).alias("d"), t.c.alias("e")) assert_frame_equal( result.to_pandas().sort_values('c').reset_index(drop=True), pd.DataFrame( [["Hi,Hi2,Hi3", "1,2,3", "Hi:3,Hi2:2,Hi3:1", 3, "hi"], ["hello,hello2", "1,3", 'hello:3,hello2:1', 2, "hello"]], columns=['a', 'b', 'c', 'd', 'e']))
def test_aggregate_with_pandas_udaf(self): t = self.t_env.from_elements( [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW([ DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT()) ])) table_sink = source_sink_utils.TestAppendSink( ['a', 'b', 'c'], [DataTypes.TINYINT(), DataTypes.FLOAT(), DataTypes.INT()]) self.t_env.register_table_sink("Results", table_sink) pandas_udaf = udaf(lambda a: (a.mean(), a.max()), result_type=DataTypes.ROW([ DataTypes.FIELD("a", DataTypes.FLOAT()), DataTypes.FIELD("b", DataTypes.INT()) ]), func_type="pandas") t.group_by(t.a) \ .aggregate(pandas_udaf(t.b).alias("c", "d")) \ .select("a, c, d").execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["1,5.0,8", "2,2.0,3"])
def test_aggregate_with_pandas_udaf_without_keys(self): t = self.t_env.from_elements( [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW([ DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT()) ])) table_sink = source_sink_utils.TestAppendSink( ['a', 'b'], [DataTypes.FLOAT(), DataTypes.INT()]) self.t_env.register_table_sink("Results", table_sink) pandas_udaf = udaf(lambda pd: Row(pd.b.mean(), pd.b.max()), result_type=DataTypes.ROW([ DataTypes.FIELD("a", DataTypes.FLOAT()), DataTypes.FIELD("b", DataTypes.INT()) ]), func_type="pandas") t.select(t.b) \ .aggregate(pandas_udaf.alias("a", "b")) \ .select("a, b") \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[3.8, 8]"])
def test_proc_time_over_rows_window_aggregate_function(self): # create source file path import tempfile import os tmp_dir = tempfile.gettempdir() data = [ '1,1,2013-01-01 03:10:00', '3,2,2013-01-01 03:10:00', '2,1,2013-01-01 03:10:00', '1,5,2013-01-01 03:10:00', '1,8,2013-01-01 04:20:00', '2,3,2013-01-01 03:30:00' ] source_path = tmp_dir + '/test_over_rows_window_aggregate_function.csv' with open(source_path, 'w') as fd: for ele in data: fd.write(ele + '\n') max_add_min_udaf = udaf(lambda a: a.max() + a.min(), result_type=DataTypes.SMALLINT(), func_type='pandas') self.env.set_parallelism(1) self.env.set_stream_time_characteristic( TimeCharacteristic.ProcessingTime) self.t_env.register_function("mean_udaf", mean_udaf) self.t_env.register_function("max_add_min_udaf", max_add_min_udaf) source_table = """ create table source_table( a TINYINT, b SMALLINT, proctime as PROCTIME() ) with( 'connector.type' = 'filesystem', 'format.type' = 'csv', 'connector.path' = '%s', 'format.ignore-first-line' = 'false', 'format.field-delimiter' = ',' ) """ % source_path self.t_env.execute_sql(source_table) table_sink = source_sink_utils.TestAppendSink( ['a', 'b', 'c'], [DataTypes.TINYINT(), DataTypes.FLOAT(), DataTypes.SMALLINT()]) self.t_env.register_table_sink("Results", table_sink) self.t_env.execute_sql(""" insert into Results select a, mean_udaf(b) over (PARTITION BY a ORDER BY proctime ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), max_add_min_udaf(b) over (PARTITION BY a ORDER BY proctime ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) from source_table """).wait() actual = source_sink_utils.results() self.assert_equals(actual, [ "+I[1, 1.0, 2]", "+I[1, 3.0, 6]", "+I[1, 6.5, 13]", "+I[2, 1.0, 2]", "+I[2, 2.0, 4]", "+I[3, 2.0, 4]" ]) os.remove(source_path)
def test_group_aggregate_with_aux_group(self): t = self.t_env.from_elements( [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT())])) sink_table_ddl = """ CREATE TABLE Results(a TINYINT, b INT, c FLOAT, d INT) WITH ('connector'='test-sink') """ self.t_env.execute_sql(sink_table_ddl) self.t_env.get_config().get_configuration().set_string('python.metric.enabled', 'true') self.t_env.get_config().set('python.metric.enabled', 'true') self.t_env.register_function("max_add", udaf(MaxAdd(), result_type=DataTypes.INT(), func_type="pandas")) self.t_env.create_temporary_system_function("mean_udaf", mean_udaf) t.group_by(t.a) \ .select(t.a, (t.a + 1).alias("b"), (t.a + 2).alias("c")) \ .group_by(t.a, t.b) \ .select(t.a, t.b, mean_udaf(t.b), call("max_add", t.b, t.c, 1)) \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[1, 2, 2.0, 6]", "+I[2, 3, 3.0, 8]", "+I[3, 4, 4.0, 10]"])
def test_row_time_over_range_window_aggregate_function(self): # create source file path import tempfile import os tmp_dir = tempfile.gettempdir() data = [ '1,1,2013-01-01 03:10:00', '3,2,2013-01-01 03:10:00', '2,1,2013-01-01 03:10:00', '1,5,2013-01-01 03:10:00', '1,8,2013-01-01 04:20:00', '2,3,2013-01-01 03:30:00' ] source_path = tmp_dir + '/test_over_range_window_aggregate_function.csv' with open(source_path, 'w') as fd: for ele in data: fd.write(ele + '\n') max_add_min_udaf = udaf(lambda a: a.max() + a.min(), result_type=DataTypes.SMALLINT(), func_type='pandas') self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime) self.t_env.register_function("mean_udaf", mean_udaf) self.t_env.register_function("max_add_min_udaf", max_add_min_udaf) source_table = """ create table source_table( a TINYINT, b SMALLINT, rowtime TIMESTAMP(3), WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE ) with( 'connector.type' = 'filesystem', 'format.type' = 'csv', 'connector.path' = '%s', 'format.ignore-first-line' = 'false', 'format.field-delimiter' = ',' ) """ % source_path self.t_env.execute_sql(source_table) table_sink = source_sink_utils.TestAppendSink( ['a', 'b', 'c'], [ DataTypes.TINYINT(), DataTypes.FLOAT(), DataTypes.SMALLINT()]) self.t_env.register_table_sink("Results", table_sink) self.t_env.execute_sql(""" insert into Results select a, mean_udaf(b) over (PARTITION BY a ORDER BY rowtime RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND CURRENT ROW), max_add_min_udaf(b) over (PARTITION BY a ORDER BY rowtime RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND CURRENT ROW) from source_table """).wait() actual = source_sink_utils.results() self.assert_equals(actual, ["1,3.0,6", "1,3.0,6", "1,8.0,16", "2,1.0,2", "2,2.0,4", "3,2.0,4"]) os.remove(source_path)
def test_group_aggregate_function(self): t = self.t_env.from_elements( [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT())])) sink_table_ddl = """ CREATE TABLE Results(a TINYINT, b FLOAT,c ROW<a INT, b INT>) WITH ('connector'='test-sink') """ self.t_env.execute_sql(sink_table_ddl) # general udf add = udf(lambda a: a + 1, result_type=DataTypes.INT()) # pandas udf substract = udf(lambda a: a - 1, result_type=DataTypes.INT(), func_type="pandas") max_udaf = udaf(lambda a: (a.max(), a.min()), result_type=DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.INT()), DataTypes.FIELD("b", DataTypes.INT())]), func_type="pandas") t.group_by(t.a) \ .select(t.a, mean_udaf(add(t.b)), max_udaf(substract(t.c))) \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals( actual, ["+I[1, 6.0, +I[5, 2]]", "+I[2, 3.0, +I[3, 2]]", "+I[3, 3.0, +I[2, 2]]"])
def test_group_aggregate_function(self): t = self.t_env.from_elements( [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT())])) table_sink = source_sink_utils.TestAppendSink( ['a', 'b', 'c'], [DataTypes.TINYINT(), DataTypes.FLOAT(), DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.INT()), DataTypes.FIELD("b", DataTypes.INT())])]) self.t_env.register_table_sink("Results", table_sink) # general udf add = udf(lambda a: a + 1, result_type=DataTypes.INT()) # pandas udf substract = udf(lambda a: a - 1, result_type=DataTypes.INT(), func_type="pandas") max_udaf = udaf(lambda a: (a.max(), a.min()), result_type=DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.INT()), DataTypes.FIELD("b", DataTypes.INT())]), func_type="pandas") t.group_by(t.a) \ .select(t.a, mean_udaf(add(t.b)), max_udaf(substract(t.c))) \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals( actual, ["+I[1, 6.0, +I[5, 2]]", "+I[2, 3.0, +I[3, 2]]", "+I[3, 3.0, +I[2, 2]]"])
def test_aggregate_with_pandas_udaf(self): t = self.t_env.from_elements( [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT())])) sink_table_ddl = """ CREATE TABLE Results(a TINYINT, b FLOAT, c INT) WITH ('connector'='test-sink') """ self.t_env.execute_sql(sink_table_ddl) pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.a.max()), result_type=DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.FLOAT()), DataTypes.FIELD("b", DataTypes.INT())]), func_type="pandas") t.select(t.a, t.b) \ .group_by(t.a) \ .aggregate(pandas_udaf) \ .select(expr.col("*")) \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[1, 5.0, 1]", "+I[2, 2.0, 2]"])
def test_group_aggregate_with_aux_group(self): t = self.t_env.from_elements( [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT())])) table_sink = source_sink_utils.TestAppendSink( ['a', 'b', 'c', 'd'], [DataTypes.TINYINT(), DataTypes.INT(), DataTypes.FLOAT(), DataTypes.INT()]) self.t_env.register_table_sink("Results", table_sink) self.t_env.get_config().set('python.metric.enabled', 'true') self.t_env.register_function("max_add", udaf(MaxAdd(), result_type=DataTypes.INT(), func_type="pandas")) self.t_env.create_temporary_system_function("mean_udaf", mean_udaf) t.group_by(t.a) \ .select(t.a, (t.a + 1).alias("b"), (t.a + 2).alias("c")) \ .group_by(t.a, t.b) \ .select(t.a, t.b, mean_udaf(t.b), call("max_add", t.b, t.c, 1)) \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[1, 2, 2.0, 6]", "+I[2, 3, 3.0, 8]", "+I[3, 4, 4.0, 10]"])
def test_slide_group_window_aggregate_function(self): import datetime from pyflink.table.window import Slide t = self.t_env.from_elements( [ (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)), (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)), (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0)) ], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT()), DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))])) table_sink = source_sink_utils.TestAppendSink( ['a', 'b', 'c', 'd', 'e'], [ DataTypes.TINYINT(), DataTypes.TIMESTAMP(3), DataTypes.TIMESTAMP(3), DataTypes.FLOAT(), DataTypes.INT() ]) self.t_env.register_table_sink("Results", table_sink) self.t_env.register_function("max_add", udaf(MaxAdd(), result_type=DataTypes.INT(), func_type="pandas")) self.t_env.create_temporary_system_function("mean_udaf", mean_udaf) slide_window = Slide.over(lit(1).hours) \ .every(lit(30).minutes) \ .on(col("rowtime")) \ .alias("w") t.window(slide_window) \ .group_by(t.a, col("w")) \ .select(t.a, col("w").start, col("w").end, mean_udaf(t.b), call("max_add", t.b, t.c, 1)) \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[1, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0, 6]", "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.5, 7]", "+I[1, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 5.5, 14]", "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0, 14]", "+I[2, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 1.0, 4]", "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0, 10]", "+I[2, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 3.0, 10]", "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0, 7]", "+I[3, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0, 7]"])
def test_using_decorator(self): my_count = udaf(CountAggregateFunction(), accumulator_type=DataTypes.ARRAY(DataTypes.INT()), result_type=DataTypes.INT()) t = self.t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c']) result = t.group_by(t.c) \ .select(my_count(t.a).alias("a"), t.c.alias("b")) plan = result.explain() result_type = result.get_schema().get_field_data_type(0) self.assertTrue(plan.find("PythonGroupAggregate(groupBy=[c], ") >= 0) self.assertEqual(result_type, DataTypes.INT())
def process(self) -> None: calculate = udaf(CalculateAgg()) shop_agg = udaf(ShopAgg()) table = self.table_env.from_path("wish_product_data") result_table = table.group_by(table.pid)\ .select(calculate(table.timestamp, table.pid, table.merchant_id, table.merchant_name, table.shop_name, table.review_number, table.review_score, table.shop_review_number, table.title, table.is_pb, table.is_hwc, table.is_verified, table.total_bought, table.total_wishlist, table.tags, table.category_ids, table.category_paths, table.category_l1_ids, table.category_l2_ids, table.category_l3_ids, table.leaf_category_ids, table.price, table.shipping_price, table.sold, table.update_time, table.shop_open_time, table.gen_time, table.data_update_time)) #calculate(table.timestamp, table.update_time, table.sold, table.price, table.review_number)) shop_table = table.select(table.update_time, table.shop_name) shop_table = shop_table.add_columns("1 as count") shop_result_table = shop_table.group_by(shop_table.shop_name)\ .select(shop_table.shop_name, shop_agg(table.update_time, shop_table.count)) self.sink_to_kafka(self.wish_result_topic, result_table, self.result_type) self.sink_to_kafka(self.wish_shop_result_topic, shop_result_table, self.shop_result_type)
def test_data_view_clear(self): my_count = udaf(CountDistinctAggregateFunction()) self.t_env.get_config().set_idle_state_retention( datetime.timedelta(days=1)) self.t_env.get_config().set("python.fn-execution.bundle.size", "2") # trigger the cache eviction in a bundle. self.t_env.get_config().set("python.state.cache-size", "1") t = self.t_env.from_elements([(2, 'hello', 'hello'), (4, 'clear', 'hello'), (6, 'hello2', 'hello'), (8, 'hello', 'hello')], ['a', 'b', 'c']) result = t.group_by(t.c).select(my_count(t.b).alias("a"), t.c) assert_frame_equal(result.to_pandas(), pd.DataFrame([[2, "hello"]], columns=['a', 'c']))
def test_window_aggregate_with_pandas_udaf(self): import datetime from pyflink.table.window import Tumble t = self.t_env.from_elements( [ (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)), (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)), (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0)) ], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT()), DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))])) table_sink = source_sink_utils.TestAppendSink( ['a', 'b', 'c'], [ DataTypes.TIMESTAMP(3), DataTypes.FLOAT(), DataTypes.INT() ]) self.t_env.register_table_sink("Results", table_sink) pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()), result_type=DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.FLOAT()), DataTypes.FIELD("b", DataTypes.INT())]), func_type="pandas") tumble_window = Tumble.over(expr.lit(1).hours) \ .on(expr.col("rowtime")) \ .alias("w") t.select(t.b, t.rowtime) \ .window(tumble_window) \ .group_by("w") \ .aggregate(pandas_udaf.alias("d", "e")) \ .select("w.rowtime, d, e") \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["2018-03-11 03:59:59.999,2.2,3", "2018-03-11 04:59:59.999,8.0,8"])
def test_list_view(self): my_concat = udaf(ListViewConcatAggregateFunction()) self.t_env.get_config().set("python.fn-execution.bundle.size", "2") # trigger the cache eviction in a bundle. self.t_env.get_config().set("python.state.cache-size", "2") t = self.t_env.from_elements([(1, 'Hi', 'Hello'), (3, 'Hi', 'hi'), (3, 'Hi2', 'hi'), (3, 'Hi', 'hi'), (2, 'Hi', 'Hello'), (1, 'Hi2', 'Hello'), (3, 'Hi3', 'hi'), (3, 'Hi2', 'Hello'), (3, 'Hi3', 'hi'), (2, 'Hi3', 'Hello')], ['a', 'b', 'c']) result = t.group_by(t.c).select(my_concat(t.b, ',').alias("a"), t.c) assert_frame_equal( result.to_pandas().sort_values('c').reset_index(drop=True), pd.DataFrame( [["Hi,Hi,Hi2,Hi2,Hi3", "Hello"], ["Hi,Hi2,Hi,Hi3,Hi3", "hi"]], columns=['a', 'c']))
def test_data_view(self): my_concat = udaf(ConcatAggregateFunction()) self.t_env.get_config().get_configuration().set_string( "python.fn-execution.bundle.size", "2") # trigger the cache eviction in a bundle. self.t_env.get_config().get_configuration().set_string( "python.state.cache.size", "2") t = self.t_env.from_elements([(1, 'Hi', 'Hello'), (3, 'Hi', 'hi'), (3, 'Hi2', 'hi'), (3, 'Hi', 'hi2'), (2, 'Hi', 'Hello')], ['a', 'b', 'c']) result = t.group_by(t.c).select(my_concat(t.b, ',').alias("a"), t.c) assert_frame_equal(result.to_pandas(), pd.DataFrame([["Hi,Hi2", "hi"], ["Hi", "hi2"], ["Hi,Hi", "Hello"]], columns=['a', 'c']))
def test_group_aggregate_without_keys(self): t = self.t_env.from_elements( [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT())])) table_sink = source_sink_utils.TestAppendSink( ['a'], [DataTypes.INT()]) min_add = udaf(lambda a, b, c: a.min() + b.min() + c.min(), result_type=DataTypes.INT(), func_type="pandas") self.t_env.register_table_sink("Results", table_sink) t.select(min_add(t.a, t.b, t.c)) \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[5]"])
def test_window_aggregate_with_pandas_udaf(self): import datetime from pyflink.table.window import Tumble t = self.t_env.from_elements( [ (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)), (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)), (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0)) ], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT()), DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))])) sink_table_ddl = """ CREATE TABLE Results(a TIMESTAMP(3), b FLOAT, c INT) WITH ('connector'='test-sink') """ self.t_env.execute_sql(sink_table_ddl) print(t.get_schema()) pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()), result_type=DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.FLOAT()), DataTypes.FIELD("b", DataTypes.INT())]), func_type="pandas") tumble_window = Tumble.over(expr.lit(1).hours) \ .on(expr.col("rowtime")) \ .alias("w") t.select(t.b, t.rowtime) \ .window(tumble_window) \ .group_by(expr.col("w")) \ .aggregate(pandas_udaf.alias("d", "e")) \ .select(expr.col("w").rowtime, expr.col("d"), expr.col("e")) \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[2018-03-11 03:59:59.999, 2.2, 3]", "+I[2018-03-11 04:59:59.999, 8.0, 8]"])
def test_group_aggregate_without_keys(self): t = self.t_env.from_elements( [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT())])) sink_table_ddl = """ CREATE TABLE Results(a INT) WITH ('connector'='test-sink') """ self.t_env.execute_sql(sink_table_ddl) min_add = udaf(lambda a, b, c: a.min() + b.min() + c.min(), result_type=DataTypes.INT(), func_type="pandas") t.select(min_add(t.a, t.b, t.c)) \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[5]"])
def test_aggregate(self): import pandas as pd t = self.t_env.from_elements( [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.BIGINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT())])) function = CountAndSumAggregateFunction() agg = udaf(function, result_type=function.get_result_type(), accumulator_type=function.get_accumulator_type(), name=str(function.__class__.__name__)) result = t.group_by(t.a) \ .aggregate(agg.alias("c", "d")) \ .select("a, c, d") \ .to_pandas() assert_frame_equal(result, pd.DataFrame([[1, 3, 15], [2, 2, 4]], columns=['a', 'c', 'd']))
def test_aggregate_with_pandas_udaf_without_keys(self): t = self.t_env.from_elements( [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT())])) sink_table_ddl = """ CREATE TABLE Results(a FLOAT, b INT) WITH ('connector'='test-sink') """ self.t_env.execute_sql(sink_table_ddl) pandas_udaf = udaf(lambda pd: Row(pd.b.mean(), pd.b.max()), result_type=DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.FLOAT()), DataTypes.FIELD("b", DataTypes.INT())]), func_type="pandas") t.select(t.b) \ .aggregate(pandas_udaf.alias("a", "b")) \ .select(t.a, t.b) \ .execute_insert("Results") \ .wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[3.8, 8]"])
def test_execute_over_aggregate_from_json_plan(self): # create source file path tmp_dir = self.tempdir data = [ '1,1,2013-01-01 03:10:00', '3,2,2013-01-01 03:10:00', '2,1,2013-01-01 03:10:00', '1,5,2013-01-01 03:10:00', '1,8,2013-01-01 04:20:00', '2,3,2013-01-01 03:30:00' ] source_path = tmp_dir + '/test_execute_over_aggregate_from_json_plan.csv' sink_path = tmp_dir + '/test_execute_over_aggregate_from_json_plan' with open(source_path, 'w') as fd: for ele in data: fd.write(ele + '\n') source_table = """ CREATE TABLE source_table ( a TINYINT, b SMALLINT, rowtime TIMESTAMP(3), WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE ) WITH ( 'connector' = 'filesystem', 'path' = '%s', 'format' = 'csv' ) """ % source_path self.t_env.execute_sql(source_table) self.t_env.execute_sql(""" CREATE TABLE sink_table ( a TINYINT, b FLOAT, c SMALLINT ) WITH ( 'connector' = 'filesystem', 'path' = '%s', 'format' = 'csv' ) """ % sink_path) max_add_min_udaf = udaf(lambda a: a.max() + a.min(), result_type=DataTypes.SMALLINT(), func_type='pandas') self.t_env.get_config().set( "pipeline.time-characteristic", "EventTime") self.t_env.create_temporary_system_function("mean_udaf", mean_udaf) self.t_env.create_temporary_system_function("max_add_min_udaf", max_add_min_udaf) json_plan = self.t_env._j_tenv.compilePlanSql(""" insert into sink_table select a, mean_udaf(b) over (PARTITION BY a ORDER BY rowtime ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), max_add_min_udaf(b) over (PARTITION BY a ORDER BY rowtime ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) from source_table """) from py4j.java_gateway import get_method get_method(self.t_env._j_tenv.executePlan(json_plan), "await")() import glob lines = [line.strip() for file in glob.glob(sink_path + '/*') for line in open(file, 'r')] lines.sort() self.assertEqual(lines, ['1,1.0,2', '1,3.0,6', '1,6.5,13', '2,1.0,2', '2,2.0,4', '3,2.0,4'])
def test_over_window_aggregate_function(self): import datetime t = self.t_env.from_elements( [ (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (3, 2, 1, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)), (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)), (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0)) ], DataTypes.ROW( [DataTypes.FIELD("a", DataTypes.TINYINT()), DataTypes.FIELD("b", DataTypes.SMALLINT()), DataTypes.FIELD("c", DataTypes.INT()), DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))])) table_sink = source_sink_utils.TestAppendSink( ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], [DataTypes.TINYINT(), DataTypes.FLOAT(), DataTypes.INT(), DataTypes.FLOAT(), DataTypes.FLOAT(), DataTypes.FLOAT(), DataTypes.FLOAT(), DataTypes.FLOAT(), DataTypes.FLOAT(), DataTypes.FLOAT()]) self.t_env.register_table_sink("Results", table_sink) self.t_env.create_temporary_system_function("mean_udaf", mean_udaf) self.t_env.register_function("max_add", udaf(MaxAdd(), result_type=DataTypes.INT(), func_type="pandas")) self.t_env.register_table("T", t) self.t_env.execute_sql(""" insert into Results select a, mean_udaf(b) over (PARTITION BY a ORDER BY rowtime ROWS BETWEEN UNBOUNDED preceding AND UNBOUNDED FOLLOWING), max_add(b, c) over (PARTITION BY a ORDER BY rowtime ROWS BETWEEN UNBOUNDED preceding AND 0 FOLLOWING), mean_udaf(b) over (PARTITION BY a ORDER BY rowtime ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING), mean_udaf(c) over (PARTITION BY a ORDER BY rowtime ROWS BETWEEN 1 PRECEDING AND 0 FOLLOWING), mean_udaf(c) over (PARTITION BY a ORDER BY rowtime RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), mean_udaf(b) over (PARTITION BY a ORDER BY rowtime RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), mean_udaf(b) over (PARTITION BY a ORDER BY rowtime RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND UNBOUNDED FOLLOWING), mean_udaf(c) over (PARTITION BY a ORDER BY rowtime RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND UNBOUNDED FOLLOWING), mean_udaf(c) over (PARTITION BY a ORDER BY rowtime RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND CURRENT ROW) from T """).wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[1, 4.3333335, 5, 4.3333335, 3.0, 3.0, 2.5, 4.3333335, 3.0, 2.0]", "+I[1, 4.3333335, 13, 5.5, 3.0, 3.0, 4.3333335, 8.0, 5.0, 5.0]", "+I[1, 4.3333335, 6, 4.3333335, 2.0, 3.0, 2.5, 4.3333335, 3.0, 2.0]", "+I[2, 2.0, 9, 2.0, 4.0, 4.0, 2.0, 2.0, 4.0, 4.0]", "+I[2, 2.0, 3, 2.0, 2.0, 4.0, 1.0, 2.0, 4.0, 2.0]", "+I[3, 2.0, 3, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0]"])
def test_row_time_over_rows_window_aggregate_function(self): # create source file path import tempfile import os tmp_dir = tempfile.gettempdir() data = [ '1,1,2013-01-01 03:10:00', '3,2,2013-01-01 03:10:00', '2,1,2013-01-01 03:10:00', '1,5,2013-01-01 03:10:00', '1,8,2013-01-01 04:20:00', '2,3,2013-01-01 03:30:00' ] source_path = tmp_dir + '/test_over_rows_window_aggregate_function.csv' with open(source_path, 'w') as fd: for ele in data: fd.write(ele + '\n') max_add_min_udaf = udaf(lambda a: a.max() + a.min(), result_type=DataTypes.SMALLINT(), func_type='pandas') self.t_env.get_config().set( "pipeline.time-characteristic", "EventTime") self.t_env.register_function("mean_udaf", mean_udaf) self.t_env.register_function("max_add_min_udaf", max_add_min_udaf) source_table = """ create table source_table( a TINYINT, b SMALLINT, rowtime TIMESTAMP(3), WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE ) with( 'connector.type' = 'filesystem', 'format.type' = 'csv', 'connector.path' = '%s', 'format.ignore-first-line' = 'false', 'format.field-delimiter' = ',' ) """ % source_path self.t_env.execute_sql(source_table) sink_table_ddl = """ CREATE TABLE Results(a TINYINT, b FLOAT, c SMALLINT) WITH ('connector'='test-sink') """ self.t_env.execute_sql(sink_table_ddl) self.t_env.execute_sql(""" insert into Results select a, mean_udaf(b) over (PARTITION BY a ORDER BY rowtime ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), max_add_min_udaf(b) over (PARTITION BY a ORDER BY rowtime ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) from source_table """).wait() actual = source_sink_utils.results() self.assert_equals(actual, ["+I[1, 1.0, 2]", "+I[1, 3.0, 6]", "+I[1, 6.5, 13]", "+I[2, 1.0, 2]", "+I[2, 2.0, 4]", "+I[3, 2.0, 4]"]) os.remove(source_path)
def row_operations(): t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) # define the source table = t_env.from_elements(elements=[ (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}' ), (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}' ), (3, '{"name": "world", "tel": 124, "addr": {"country": "China", "city": "NewYork"}}' ), (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}' ) ], schema=['id', 'data']) # map operation @udf(result_type=DataTypes.ROW([ DataTypes.FIELD("id", DataTypes.BIGINT()), DataTypes.FIELD("country", DataTypes.STRING()) ])) def extract_country(input_row: Row): data = json.loads(input_row.data) return Row(input_row.id, data['addr']['country']) table.map(extract_country) \ .execute().print() # +----+----------------------+--------------------------------+ # | op | id | country | # +----+----------------------+--------------------------------+ # | +I | 1 | Germany | # | +I | 2 | China | # | +I | 3 | China | # | +I | 4 | China | # +----+----------------------+--------------------------------+ # flat_map operation @udtf(result_types=[DataTypes.BIGINT(), DataTypes.STRING()]) def extract_city(input_row: Row): data = json.loads(input_row.data) yield input_row.id, data['addr']['city'] table.flat_map(extract_city) \ .execute().print() # +----+----------------------+--------------------------------+ # | op | f0 | f1 | # +----+----------------------+--------------------------------+ # | +I | 1 | Berlin | # | +I | 2 | Shanghai | # | +I | 3 | NewYork | # | +I | 4 | Hangzhou | # +----+----------------------+--------------------------------+ # aggregate operation class CountAndSumAggregateFunction(AggregateFunction): def get_value(self, accumulator): return Row(accumulator[0], accumulator[1]) def create_accumulator(self): return Row(0, 0) def accumulate(self, accumulator, input_row): accumulator[0] += 1 accumulator[1] += int(input_row.tel) def retract(self, accumulator, input_row): accumulator[0] -= 1 accumulator[1] -= int(input_row.tel) def merge(self, accumulator, accumulators): for other_acc in accumulators: accumulator[0] += other_acc[0] accumulator[1] += other_acc[1] def get_accumulator_type(self): return DataTypes.ROW([ DataTypes.FIELD("cnt", DataTypes.BIGINT()), DataTypes.FIELD("sum", DataTypes.BIGINT()) ]) def get_result_type(self): return DataTypes.ROW([ DataTypes.FIELD("cnt", DataTypes.BIGINT()), DataTypes.FIELD("sum", DataTypes.BIGINT()) ]) count_sum = udaf(CountAndSumAggregateFunction()) table.add_columns( col('data').json_value('$.name', DataTypes.STRING()).alias('name'), col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'), col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \ .group_by(col('country')) \ .aggregate(count_sum.alias("cnt", "sum")) \ .select(col('country'), col('cnt'), col('sum')) \ .execute().print() # +----+--------------------------------+----------------------+----------------------+ # | op | country | cnt | sum | # +----+--------------------------------+----------------------+----------------------+ # | +I | China | 3 | 291 | # | +I | Germany | 1 | 123 | # +----+--------------------------------+----------------------+----------------------+ # flat_aggregate operation class Top2(TableAggregateFunction): def emit_value(self, accumulator): for v in accumulator: if v: yield Row(v) def create_accumulator(self): return [None, None] def accumulate(self, accumulator, input_row): tel = int(input_row.tel) if accumulator[0] is None or tel > accumulator[0]: accumulator[1] = accumulator[0] accumulator[0] = tel elif accumulator[1] is None or tel > accumulator[1]: accumulator[1] = tel def get_accumulator_type(self): return DataTypes.ARRAY(DataTypes.BIGINT()) def get_result_type(self): return DataTypes.ROW([DataTypes.FIELD("tel", DataTypes.BIGINT())]) top2 = udtaf(Top2()) table.add_columns( col('data').json_value('$.name', DataTypes.STRING()).alias('name'), col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'), col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \ .group_by(col('country')) \ .flat_aggregate(top2) \ .select(col('country'), col('tel')) \ .execute().print()
from pyflink.table import AggregateFunction, DataTypes from pyflink.table.udf import udaf class WeightedAvg(AggregateFunction): def create_accumulator(self): # Row(sum, count) return Row(0, 0) def get_value(self, accumulator: Row) -> float: if accumulator[1] == 0: return 0 else: return accumulator[0] / accumulator[1] def accumulate(self, accumulator: Row, value, weight): accumulator[0] += value * weight accumulator[1] += weight def retract(self, accumulator: Row, value, weight): accumulator[0] -= value * weight accumulator[1] -= weight weighted_avg = udaf(f=WeightedAvg(), result_type=DataTypes.DOUBLE(), accumulator_type=DataTypes.ROW([ DataTypes.FIELD("f0", DataTypes.BIGINT()), DataTypes.FIELD("f1", DataTypes.BIGINT()) ]))