def test_union_all(self):
    """Union two CSV-backed sources with ``union_all`` and check duplicate rows are kept."""
    # Both sources and the sink share this schema, so it is declared once.
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    data = [(1, "Hi", "Hello"), (2, "Hi", "Hello"), (3, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    source_path2 = os.path.join(self.tempdir, 'streaming2.csv')
    data2 = [(2, "Hi", "Hello"), (3, "Hello", "Python"), (4, "Hi", "Flink")]
    csv_source2 = self.prepare_csv_source(source_path2, data2, field_types, field_names)

    t_env = self.t_env
    t_env.register_table_source("Source1", csv_source)
    t_env.register_table_source("Source2", csv_source2)
    source1 = t_env.scan("Source1")
    source2 = t_env.scan("Source2")
    t_env.register_table_sink(
        "Results",
        field_names, field_types, source_sink_utils.TestAppendSink())

    result = source1.union_all(source2)
    result.insert_into("Results")
    t_env.execute()

    actual = source_sink_utils.results()
    # Rows present in both inputs ('2,Hi,Hello') appear twice in the output.
    expected = ['1,Hi,Hello',
                '2,Hi,Hello',
                '2,Hi,Hello',
                '3,Hello,Hello',
                '3,Hello,Python',
                '4,Hi,Flink']
    self.assert_equals(actual, expected)
def test_select(self):
    """Select ``a + 1, b, c`` from a CSV-backed ``Orders`` table and check the sink rows."""
    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(source_path, 'w') as f:
        f.write('1,hi,hello\n' + '2,hi,hello\n')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env = self.t_env
    # register Orders table in table environment
    t_env.register_table_source(
        "Orders",
        CsvTableSource(source_path, field_names, field_types))
    t_env.register_table_sink(
        "Results",
        field_names, field_types, source_sink_utils.TestAppendSink())

    t_env.scan("Orders") \
        .select("a + 1, b, c") \
        .insert_into("Results")
    t_env.execute()

    actual = source_sink_utils.results()
    expected = ['2,hi,hello', '3,hi,hello']
    self.assert_equals(actual, expected)
def test_left_outer_join_without_where(self):
    """Left-outer-join two CSV sources on ``a = d`` and select ``a, b + e``.

    The left row without a match (``a = 1``) is expected as ``'1,null'``.
    """
    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hi", "Hello"), (3, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    source_path2 = os.path.join(self.tempdir, 'streaming2.csv')
    field_names2 = ["d", "e"]
    field_types2 = [DataTypes.INT(), DataTypes.STRING()]
    data2 = [(2, "Flink"), (3, "Python"), (3, "Flink")]
    csv_source2 = self.prepare_csv_source(source_path2, data2, field_types2, field_names2)

    t_env = self.t_env
    t_env.register_table_source("Source1", csv_source)
    t_env.register_table_source("Source2", csv_source2)
    source1 = t_env.scan("Source1")
    source2 = t_env.scan("Source2")

    # The sink schema differs from both sources: (a INT, b STRING).
    sink_field_names = ["a", "b"]
    sink_field_types = [DataTypes.INT(), DataTypes.STRING()]
    t_env.register_table_sink(
        "Results",
        sink_field_names, sink_field_types, source_sink_utils.TestRetractSink())

    result = source1.left_outer_join(source2, "a = d").select("a, b + e")
    result.insert_into("Results")
    t_env.execute()

    actual = source_sink_utils.results()
    expected = ['1,null', '2,HiFlink', '3,HelloPython', '3,HelloFlink']
    self.assert_equals(actual, expected)
def test_open(self):
    """Apply the ``subtract`` UDF with ``python.metric.enabled`` set to ``'true'``.

    For the input rows used here, each expected second column equals ``b - 1``.
    """
    env = self.t_env
    env.get_config().get_configuration().set_string('python.metric.enabled', 'true')

    subtract_udf = udf(Subtract(), result_type=DataTypes.BIGINT())
    env.register_function("subtract", subtract_udf)

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(['a', 'b'], sink_types))

    rows = [(1, 2), (2, 5), (3, 4)]
    table = env.from_elements(rows, ['a', 'b'])
    projected = table.select("a, subtract(b)")
    projected.insert_into("Results")
    env.execute("test")

    self.assert_equals(source_sink_utils.results(), ["1,1", "2,4", "3,3"])
def test_overwrite_builtin_function(self):
    """Register a UDF named ``plus`` and check ``plus(a, b)`` yields ``a + b - 1``."""
    env = self.t_env
    custom_plus = udf(lambda i, j: i + j - 1, result_type=DataTypes.BIGINT())
    env.register_function("plus", custom_plus)

    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(['a'], [DataTypes.BIGINT()]))

    rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
    table = env.from_elements(rows, ['a', 'b', 'c'])
    table.select("plus(a, b)").insert_into("Results")
    env.execute("test")

    self.assert_equals(source_sink_utils.results(), ["2", "6", "3"])
def test_overwrite_builtin_function(self):
    """Register a temporary system function ``plus`` and insert ``t.a + t.b`` into a sink.

    The expected rows equal ``a + b - 1``, i.e. what the registered UDF computes.
    """
    env = self.t_env
    custom_plus = udf(lambda i, j: i + j - 1, result_type=DataTypes.BIGINT())
    env.create_temporary_system_function("plus", custom_plus)

    env.execute_sql(""" CREATE TABLE Results(a BIGINT) WITH ('connector'='test-sink') """)

    rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
    table = env.from_elements(rows, ['a', 'b', 'c'])
    total = table.a + table.b
    table.select(total).execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[2]", "+I[6]", "+I[3]"])
def test_udf_in_join_condition_2(self):
    """Join two tables where both sides of the condition call the UDF: ``f(a) = f(c)``."""
    env = self.t_env
    left = env.from_elements([(1, "Hi"), (2, "Hi")], ['a', 'b'])
    right = env.from_elements([(2, "Flink")], ['c', 'd'])

    identity = udf(lambda i: i, result_type=DataTypes.BIGINT())
    env.create_temporary_system_function("f", identity)

    sink_names = ['a', 'b', 'c', 'd']
    sink_types = [DataTypes.BIGINT(), DataTypes.STRING(),
                  DataTypes.BIGINT(), DataTypes.STRING()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(sink_names, sink_types))

    joined = left.join(right).where("f(a) = f(c)")
    exec_insert_table(joined, "Results")

    self.assert_equals(source_sink_utils.results(), ["2,Hi,2,Flink"])
def test_sql_query(self):
    """Run ``select a + 1, b, c`` via ``sql_query`` on an in-memory table and check the sink."""
    env = self.t_env
    rows = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    source = env.from_elements(rows, ["a", "b", "c"])

    sink = source_sink_utils.TestAppendSink(
        ["a", "b", "c"],
        [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()])
    env.register_table_sink("sinks", sink)

    # The Table object is interpolated directly into the SQL text.
    query = "select a + 1, b, c from %s" % source
    env.sql_query(query).execute_insert("sinks").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ['+I[2, Hi, Hello]', '+I[3, Hello, Hello]'])
def test_sql_update(self):
    """Copy an in-memory table into a sink with ``sql_update`` and an ``insert into``."""
    env = self.t_env
    rows = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    source = env.from_elements(rows, ["a", "b", "c"])

    sink = source_sink_utils.TestAppendSink(
        ["a", "b", "c"],
        [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()])
    env.register_table_sink("sinks", sink)

    # The Table object is interpolated directly into the SQL text.
    statement = "insert into sinks select * from %s" % source
    env.sql_update(statement)
    env.execute("test_sql_job")

    self.assert_equals(
        source_sink_utils.results(),
        ['+I[1, Hi, Hello]', '+I[2, Hello, Hello]'])
def test_udf_without_arguments(self):
    """Register two zero-argument UDFs (one deterministic, one not) and select both."""
    env = self.t_env

    constant_one = udf(
        lambda: 1,
        input_types=[],
        result_type=DataTypes.BIGINT(),
        deterministic=True)
    env.register_function("one", constant_one)

    constant_two = udf(
        lambda: 2,
        input_types=[],
        result_type=DataTypes.BIGINT(),
        deterministic=False)
    env.register_function("two", constant_two)

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(['a', 'b'], sink_types))

    table = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    table.select("one(), two()").insert_into("Results")
    env.execute("test")

    # One output row per input row, each holding the two constants.
    self.assert_equals(source_sink_utils.results(), ["1,2", "1,2", "1,2"])
def test_udf_in_join_condition(self):
    """Join two single-row tables with a UDF on one side of the condition: ``f(a) = c``."""
    env = self.t_env
    left = env.from_elements([(2, "Hi")], ['a', 'b'])
    right = env.from_elements([(2, "Flink")], ['c', 'd'])

    identity = udf(lambda i: i, DataTypes.BIGINT(), DataTypes.BIGINT())
    env.register_function("f", identity)

    sink_names = ['a', 'b', 'c', 'd']
    sink_types = [DataTypes.BIGINT(), DataTypes.STRING(),
                  DataTypes.BIGINT(), DataTypes.STRING()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(sink_names, sink_types))

    joined = left.join(right).where("f(a) = c")
    joined.insert_into("Results")
    env.execute("test")

    self.assert_equals(source_sink_utils.results(), ["2,Hi,2,Flink"])
def test_map_with_pandas_udf(self):
    """Chain two pandas UDFs and one general UDF through ``Table.map`` and check the rows."""
    env = self.t_env

    def row_of_two_bigints():
        # Builds a fresh ROW<c BIGINT, d BIGINT> result type for each UDF.
        return DataTypes.ROW(
            [DataTypes.FIELD("c", DataTypes.BIGINT()),
             DataTypes.FIELD("d", DataTypes.BIGINT())])

    nested_type = DataTypes.ROW(
        [DataTypes.FIELD("c", DataTypes.INT()),
         DataTypes.FIELD("d", DataTypes.INT())])
    schema = DataTypes.ROW(
        [DataTypes.FIELD("a", DataTypes.TINYINT()),
         DataTypes.FIELD("b", nested_type)])
    rows = [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)), (2, Row(3, 4))]
    t = env.from_elements(rows, schema)

    env.execute_sql(
        """ CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink') """)

    def func(x):
        import pandas as pd
        # Two output columns: a, and c + d.
        return pd.concat([x.a, x.c + x.d], axis=1)

    def func2(x):
        return x * 2

    def func3(x):
        # The general (non-pandas) UDF is expected to receive a Row.
        assert isinstance(x, Row)
        return x

    pandas_udf = udf(func, result_type=row_of_two_bigints(), func_type='pandas')
    pandas_udf_2 = udf(func2, result_type=row_of_two_bigints(), func_type='pandas')
    general_udf = udf(func3, result_type=row_of_two_bigints())

    mapped = t.map(pandas_udf).map(pandas_udf_2).map(general_udf)
    mapped.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
def test_scalar_function(self):
    """Register scalar UDFs built from several kinds of callables and use them in one query.

    Covers a lambda, a ``SubtractOne`` instance, the module-level ``add``,
    a ``CallablePlus`` instance and a ``functools.partial``.
    """
    import functools

    env = self.t_env

    def partial_func(col, param):
        return col + param

    # test lambda function
    env.register_function(
        "add_one", udf(lambda i: i + 1, DataTypes.BIGINT(), DataTypes.BIGINT()))

    # test Python ScalarFunction
    env.register_function(
        "subtract_one", udf(SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT()))

    # test Python function
    env.register_function("add", add)

    # test callable function
    env.register_function(
        "add_one_callable", udf(CallablePlus(), DataTypes.BIGINT(), DataTypes.BIGINT()))

    # test partial function
    add_one_partial = functools.partial(partial_func, param=1)
    env.register_function(
        "add_one_partial", udf(add_one_partial, DataTypes.BIGINT(), DataTypes.BIGINT()))

    sink_names = ['a', 'b', 'c', 'd', 'e']
    sink_types = [DataTypes.BIGINT() for _ in sink_names]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(sink_names, sink_types))

    table = env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    filtered = table.where("add_one(b) <= 3")
    projected = filtered.select(
        "add_one(a), subtract_one(b), add(a, c), add_one_callable(a), "
        "add_one_partial(a)")
    projected.insert_into("Results")
    env.execute("test")

    self.assert_equals(source_sink_utils.results(), ["2,1,4,2,2", "4,0,12,4,4"])
def test_scalar_function(self):
    """Use scalar UDFs built from several kinds of callables via the expression API.

    Runs with ``python.metric.enabled`` set to ``'false'`` and also includes a UDF
    that reads ``_PYTHON_WORKER_MEMORY_LIMIT`` from the environment.
    """
    import functools

    env = self.t_env

    # test metric disabled.
    env.get_config().get_configuration().set_string('python.metric.enabled', 'false')

    def partial_func(col, param):
        return col + param

    # check memory limit is set
    @udf(result_type=DataTypes.BIGINT())
    def check_memory_limit():
        assert os.environ['_PYTHON_WORKER_MEMORY_LIMIT'] is not None
        return 1

    # test lambda function
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())

    # test Python ScalarFunction
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

    # test callable function
    add_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())

    # test partial function
    add_one_partial = udf(
        functools.partial(partial_func, param=1), result_type=DataTypes.BIGINT())

    sink_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    sink_types = [DataTypes.BIGINT() for _ in sink_names]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(sink_names, sink_types))

    t = env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    filtered = t.where(add_one(t.b) <= 3)
    projected = filtered.select(
        add_one(t.a),
        subtract_one(t.b),
        add(t.a, t.c),
        add_one_callable(t.a),
        add_one_partial(t.a),
        check_memory_limit(),
        t.a)
    projected.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[2, 1, 4, 2, 2, 1, 1]", "+I[4, 0, 12, 4, 4, 1, 3]"])
def test_map_with_pandas_udf(self):
    """Chain two pandas UDFs and one general UDF through ``Table.map`` into an append sink."""
    env = self.t_env

    def row_of_two_bigints():
        # Builds a fresh ROW<c BIGINT, d BIGINT> result type for each UDF.
        return DataTypes.ROW(
            [DataTypes.FIELD("c", DataTypes.BIGINT()),
             DataTypes.FIELD("d", DataTypes.BIGINT())])

    nested_type = DataTypes.ROW(
        [DataTypes.FIELD("c", DataTypes.INT()),
         DataTypes.FIELD("d", DataTypes.INT())])
    schema = DataTypes.ROW(
        [DataTypes.FIELD("a", DataTypes.TINYINT()),
         DataTypes.FIELD("b", nested_type)])
    rows = [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)), (2, Row(3, 4))]
    t = env.from_elements(rows, schema)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    env.register_table_sink("Results", sink)

    def func(x):
        import pandas as pd
        # Two output columns: a, and c + d.
        return pd.concat([x.a, x.c + x.d], axis=1)

    def func2(x):
        return x * 2

    def func3(x):
        # The general (non-pandas) UDF is expected to receive a Row.
        assert isinstance(x, Row)
        return x

    pandas_udf = udf(func, result_type=row_of_two_bigints(), func_type='pandas')
    pandas_udf_2 = udf(func2, result_type=row_of_two_bigints(), func_type='pandas')
    general_udf = udf(func3, result_type=row_of_two_bigints())

    mapped = t.map(pandas_udf).map(pandas_udf_2).map(general_udf)
    mapped.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
def test_sliding_group_window_over_count(self):
    """Aggregate ``c`` with ``my_sum`` per key ``a`` over a 2-row window sliding by 1 row.

    The window is defined on the ``protime`` column of a CSV-backed table.
    """
    from pyflink.testing import source_sink_utils

    env = self.t_env
    env.get_config().get_configuration().set_string("parallelism.default", "1")

    # create source file path
    source_path = self.tempdir + '/test_sliding_group_window_over_count.csv'
    records = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00'
    ]
    with open(source_path, 'w') as fd:
        fd.writelines(record + '\n' for record in records)

    env.register_function("my_sum", SumAggregateFunction())

    source_table = """ create table source_table( a TINYINT, b SMALLINT, c SMALLINT, protime as PROCTIME() ) with( 'connector.type' = 'filesystem', 'format.type' = 'csv', 'connector.path' = '%s', 'format.ignore-first-line' = 'false', 'format.field-delimiter' = ',' ) """ % source_path
    env.execute_sql(source_table)
    table = env.from_path("source_table")

    sink = source_sink_utils.TestAppendSink(
        ['a', 'd'], [DataTypes.TINYINT(), DataTypes.BIGINT()])
    env.register_table_sink("Results", sink)

    sliding_window = Slide.over("2.rows").every("1.rows").on("protime").alias("w")
    aggregated = table.window(sliding_window) \
        .group_by("a, w") \
        .select("a, my_sum(c) as b")
    aggregated.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[1, 5]", "+I[1, 11]", "+I[2, 4]", "+I[3, 5]"])
def test_chaining_scalar_function(self):
    """Nest UDF calls (``add(add_one(a), subtract_one(b))``) next to a column and a literal."""
    env = self.t_env
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

    env.execute_sql(
        """ CREATE TABLE Results(a BIGINT, b BIGINT, c INT) WITH ('connector'='test-sink') """)

    t = env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
    chained = add(add_one(t.a), subtract_one(t.b))
    projected = t.select(chained, t.c, expr.lit(1))
    projected.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[3, 1, 1]", "+I[7, 2, 1]", "+I[4, 3, 1]"])
def test_filter(self):
    """Filter an in-memory table with ``a > 1 && b = 'Hello'`` and check the remaining row."""
    env = self.t_env
    rows = [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hello')]
    table = env.from_elements(rows, ['a', 'b', 'c'])

    sink_names = ["a", "b", "c"]
    sink_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    env.register_table_sink(
        "Results", sink_names, sink_types, source_sink_utils.TestAppendSink())

    table.filter("a > 1 && b = 'Hello'").insert_into("Results")
    env.execute()

    self.assert_equals(source_sink_utils.results(), ['2,Hello,Hello'])
def test_drop_columns(self):
    """Drop columns ``a`` and ``c`` from a three-column table and emit the remaining ``b``."""
    env = self.t_env
    rows = [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hello')]
    table = env.from_elements(rows, ['a', 'b', 'c'])

    env.register_table_sink(
        "Results", ["b"], [DataTypes.STRING()], source_sink_utils.TestAppendSink())

    remaining = table.select("a, b, c").drop_columns("a, c").select("b")
    remaining.insert_into("Results")
    env.execute()

    self.assert_equals(source_sink_utils.results(), ['Hi', 'Hello'])
def test_add_or_replace_columns(self):
    """Call ``add_or_replace_columns("a + 1 as b, a + 2 as a")`` on a one-column selection."""
    env = self.t_env
    rows = [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hello')]
    table = env.from_elements(rows, ['a', 'b', 'c'])

    sink_names = ["b", "a"]
    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
    env.register_table_sink(
        "Results", sink_names, sink_types, source_sink_utils.TestAppendSink())

    only_a = table.select("a")
    only_a.add_or_replace_columns("a + 1 as b, a + 2 as a").insert_into("Results")
    env.execute()

    self.assert_equals(source_sink_utils.results(), ['3,2', '4,3'])
def test_java_transformer(self):
    """Run ``WrapperTransformer`` with ``selected_cols=["a", "b"]`` and check its output rows."""
    ml_env = MLEnvironmentFactory().get_default()
    t_env = ml_env.get_stream_table_environment()

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    t_env.register_table_sink("TransformerResults", sink)

    rows = [(1, 2, 3, 4), (4, 3, 2, 1)]
    source_table = t_env.from_elements(rows, ['a', 'b', 'c', 'd'])

    transformer = WrapperTransformer(selected_cols=["a", "b"])
    transformed = transformer.transform(t_env, source_table)
    exec_insert_table(transformed, "TransformerResults")

    self.assert_equals(source_sink_utils.results(), ["1,2", "4,3"])
def test_distinct(self):
    """Apply ``distinct()`` to a table containing a duplicate row, then select ``a, c as b``."""
    env = self.t_env
    rows = [(1, "Hi", "Hello"), (2, "Hello", "Hello"), (2, "Hello", "Hello")]
    table = env.from_elements(rows, ['a', 'b', 'c'])

    sink_names = ["a", "b"]
    sink_types = [DataTypes.BIGINT(), DataTypes.STRING()]
    env.register_table_sink(
        "Results", sink_names, sink_types, source_sink_utils.TestRetractSink())

    table.distinct().select("a, c as b").insert_into("Results")
    env.execute()

    self.assert_equals(source_sink_utils.results(), ['1,Hello', '2,Hello'])
def test_chaining_scalar_function(self):
    """Nest registered UDFs in a string expression: ``add(add_one(a), subtract_one(b)), c, 1``."""
    env = self.t_env

    add_one_udf = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    env.create_temporary_system_function("add_one", add_one_udf)
    subtract_one_udf = udf(SubtractOne(), result_type=DataTypes.BIGINT())
    env.create_temporary_system_function("subtract_one", subtract_one_udf)
    env.create_temporary_system_function("add", add)

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(['a', 'b', 'c'], sink_types))

    table = env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
    projected = table.select("add(add_one(a), subtract_one(b)), c, 1")
    exec_insert_table(projected, "Results")

    self.assert_equals(source_sink_utils.results(), ["3,1,1", "7,2,1", "4,3,1"])
def test_from_pandas(self):
    """Create a table from ``self.pdf``, verify its schema, and sink the rows with ``f2 < 2``."""
    env = self.t_env
    row_type = self.data_type

    # NOTE(review): the third positional argument (5) is passed through to
    # from_pandas unchanged; its meaning is not visible in this test.
    table = env.from_pandas(self.pdf, row_type, 5)
    self.assertEqual(row_type, table.get_schema().to_row_data_type())

    filtered = table.filter(table.f2 < 2)
    sink = source_sink_utils.TestAppendSink(
        row_type.field_names(), row_type.field_types())
    env.register_table_sink("Results", sink)
    filtered.execute_insert("Results").wait()

    expected = [
        "1,1,1,1,true,1.1,1.2,hello,[97, 97, 97],"
        "1000000000000000000.010000000000000000,2014-09-13,01:00:01,"
        "1970-01-01 00:00:00.123,[hello, 中文],1,hello,"
        "1970-01-01 00:00:00.123,[1, 2]"
    ]
    self.assert_equals(source_sink_utils.results(), expected)
def test_select(self):
    """Select ``a + 1, b, c`` from an in-memory table and check the sink rows."""
    env = self.t_env
    rows = [(1, 'hi', 'hello'), (2, 'hi', 'hello')]
    table = env.from_elements(rows, ['a', 'b', 'c'])

    sink_names = ["a", "b", "c"]
    sink_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    env.register_table_sink(
        "Results", sink_names, sink_types, source_sink_utils.TestAppendSink())

    projected = table.select("a + 1, b, c")
    projected.insert_into("Results")
    env.execute()

    self.assert_equals(source_sink_utils.results(), ['2,hi,hello', '3,hi,hello'])
def test_chaining_scalar_function(self):
    """Nest registered UDFs in a string expression: ``add(add_one(a), subtract_one(b))``."""
    env = self.t_env

    add_one_udf = udf(lambda i: i + 1, DataTypes.BIGINT(), DataTypes.BIGINT())
    env.register_function("add_one", add_one_udf)
    subtract_one_udf = udf(SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT())
    env.register_function("subtract_one", subtract_one_udf)
    env.register_function("add", add)

    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(['a'], [DataTypes.BIGINT()]))

    table = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projected = table.select("add(add_one(a), subtract_one(b))")
    projected.insert_into("Results")
    env.execute("test")

    self.assert_equals(source_sink_utils.results(), ["3", "7", "4"])
def test_group_by(self):
    """Group an in-memory table by ``c`` and select ``a.sum, c as b`` into a retract sink."""
    env = self.t_env
    rows = [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hello'), (2, 'Hello', 'Hello')]
    table = env.from_elements(rows, ['a', 'b', 'c'])

    sink_names = ["a", "b"]
    sink_types = [DataTypes.BIGINT(), DataTypes.STRING()]
    env.register_table_sink(
        "Results", sink_names, sink_types, source_sink_utils.TestRetractSink())

    grouped = table.group_by("c").select("a.sum, c as b")
    grouped.insert_into("Results")
    env.execute()

    # All three rows share c = 'Hello', so a sums to 1 + 2 + 2 = 5.
    self.assert_equals(source_sink_utils.results(), ['5,Hello'])
def test_sliding_group_window_over_count(self):
    """Aggregate ``c`` with ``my_sum`` per key ``a`` over a 2-row window sliding by 1 row.

    The window is defined on the ``protime`` column of a CSV-backed table and is
    built with the expression API (``row_interval``, ``col``, ``call``).
    """
    from pyflink.testing import source_sink_utils

    env = self.t_env
    env.get_config().set("parallelism.default", "1")

    # create source file path
    source_path = self.tempdir + '/test_sliding_group_window_over_count.csv'
    records = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00'
    ]
    with open(source_path, 'w') as fd:
        fd.writelines(record + '\n' for record in records)

    env.register_function("my_sum", SumAggregateFunction())

    source_table = """ create table source_table( a TINYINT, b SMALLINT, c SMALLINT, protime as PROCTIME() ) with( 'connector.type' = 'filesystem', 'format.type' = 'csv', 'connector.path' = '%s', 'format.ignore-first-line' = 'false', 'format.field-delimiter' = ',' ) """ % source_path
    env.execute_sql(source_table)
    t = env.from_path("source_table")

    env.execute_sql(
        """ CREATE TABLE Results(a TINYINT, d BIGINT) WITH ('connector'='test-sink') """)

    sliding_window = Slide.over(row_interval(2)) \
        .every(row_interval(1)) \
        .on(t.protime) \
        .alias("w")
    aggregated = t.window(sliding_window) \
        .group_by(t.a, col("w")) \
        .select(t.a, call("my_sum", t.c).alias("b"))
    aggregated.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[1, 5]", "+I[1, 11]", "+I[2, 4]", "+I[3, 5]"])
def test_chaining_scalar_function(self):
    """Nest UDF calls (``add(add_one(a), subtract_one(b))``) next to a column and a literal."""
    env = self.t_env
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(['a', 'b', 'c'], sink_types))

    t = env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
    chained = add(add_one(t.a), subtract_one(t.b))
    projected = t.select(chained, t.c, expr.lit(1))
    projected.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["3,1,1", "7,2,1", "4,3,1"])
def test_window_aggregate_with_pandas_udaf(self):
    """Aggregate ``b`` (mean and max) with a pandas UDAF over 1-hour tumbling windows."""
    import datetime
    from pyflink.table.window import Tumble

    env = self.t_env

    def at(hour, minute):
        # Every sample timestamp falls on 2018-03-11.
        return datetime.datetime(2018, 3, 11, hour, minute, 0, 0)

    rows = [
        (1, 2, 3, at(3, 10)),
        (3, 2, 4, at(3, 10)),
        (2, 1, 2, at(3, 10)),
        (1, 3, 1, at(3, 40)),
        (1, 8, 5, at(4, 20)),
        (2, 3, 6, at(3, 30))
    ]
    schema = DataTypes.ROW(
        [DataTypes.FIELD("a", DataTypes.TINYINT()),
         DataTypes.FIELD("b", DataTypes.SMALLINT()),
         DataTypes.FIELD("c", DataTypes.INT()),
         DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))])
    t = env.from_elements(rows, schema)

    sink_types = [DataTypes.TIMESTAMP(3), DataTypes.FLOAT(), DataTypes.INT()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(['a', 'b', 'c'], sink_types))

    udaf_result_type = DataTypes.ROW(
        [DataTypes.FIELD("a", DataTypes.FLOAT()),
         DataTypes.FIELD("b", DataTypes.INT())])
    pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                       result_type=udaf_result_type,
                       func_type="pandas")

    tumble_window = Tumble.over(expr.lit(1).hours).on(expr.col("rowtime")).alias("w")

    windowed = t.select(t.b, t.rowtime).window(tumble_window).group_by("w")
    aggregated = windowed.aggregate(pandas_udaf.alias("d", "e"))
    aggregated.select("w.rowtime, d, e").execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["2018-03-11 03:59:59.999,2.2,3",
         "2018-03-11 04:59:59.999,8.0,8"])
def test_set_environment(self):
    """Set the Python executable to a symlink and verify, inside UDFs, the worker environment.

    One UDF asserts that ``os.environ["python"]`` equals the symlink path; the other
    asserts that calling ``get_gateway()`` raises the expected error message.
    """
    env = self.t_env

    # Point the configured Python executable at a symlink of the current interpreter.
    python_exec_link_path = os.path.join(self.tempdir, "py_exec")
    os.symlink(sys.executable, python_exec_link_path)
    env.get_config().set_python_executable(python_exec_link_path)

    def check_python_exec(i):
        import os
        assert os.environ["python"] == python_exec_link_path
        return i

    env.create_temporary_system_function(
        "check_python_exec",
        udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT()))

    def check_pyflink_gateway_disabled(i):
        try:
            from pyflink.java_gateway import get_gateway
            get_gateway()
        except Exception as err:
            assert str(err).startswith(
                "It's launching the PythonGatewayServer during Python UDF"
                " execution which is unexpected.")
        else:
            # Reaching this branch means get_gateway() did not raise.
            raise Exception("The gateway server is not disabled!")
        return i

    env._remote_mode = True
    env.create_temporary_system_function(
        "check_pyflink_gateway_disabled",
        udf(check_pyflink_gateway_disabled, DataTypes.BIGINT(), DataTypes.BIGINT()))

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(['a', 'b'], sink_types))

    t = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projected = t.select(
        expr.call('check_python_exec', t.a),
        expr.call('check_pyflink_gateway_disabled', t.a))
    projected.execute_insert("Results").wait()

    # Both UDFs return their input, so each row repeats column a twice.
    self.assert_equals(source_sink_utils.results(), ["+I[1, 1]", "+I[2, 2]", "+I[3, 3]"])
def test_udf_without_arguments(self):
    """Select two zero-argument UDFs (one deterministic, one not) via the expression API."""
    env = self.t_env
    one = udf(lambda: 1, result_type=DataTypes.BIGINT(), deterministic=True)
    two = udf(lambda: 2, result_type=DataTypes.BIGINT(), deterministic=False)

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(['a', 'b'], sink_types))

    table = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projected = table.select(one(), two())
    exec_insert_table(projected, "Results")

    # One output row per input row, each holding the two constants.
    self.assert_equals(source_sink_utils.results(), ["1,2", "1,2", "1,2"])
def test_udf_in_join_condition(self):
    """Join two single-row tables with a UDF in the condition: ``f(t1.a) == t2.c``."""
    env = self.t_env
    t1 = env.from_elements([(2, "Hi")], ['a', 'b'])
    t2 = env.from_elements([(2, "Flink")], ['c', 'd'])

    f = udf(lambda i: i, result_type=DataTypes.BIGINT())

    sink_names = ['a', 'b', 'c', 'd']
    sink_types = [DataTypes.BIGINT(), DataTypes.STRING(),
                  DataTypes.BIGINT(), DataTypes.STRING()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(sink_names, sink_types))

    joined = t1.join(t2).where(f(t1.a) == t2.c)
    exec_insert_table(joined, "Results")

    self.assert_equals(source_sink_utils.results(), ["2,Hi,2,Flink"])
def test_sql_update(self):
    """Copy a CSV-backed source into a sink with ``sql_update`` and an ``insert into``."""
    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env = self.t_env
    t_env.register_table_source("source", csv_source)
    # The sink uses the same schema as the source.
    t_env.register_table_sink(
        "sinks",
        field_names, field_types, source_sink_utils.TestAppendSink())

    t_env.sql_update("insert into sinks select * from source")
    t_env.execute("test_sql_job")

    actual = source_sink_utils.results()
    expected = ['1,Hi,Hello', '2,Hello,Hello']
    self.assert_equals(actual, expected)
def test_from_table_source(self):
    """Create a table directly from a CSV table source and insert it into a sink."""
    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hi", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env = self.t_env
    # The sink uses the same schema as the source.
    t_env.register_table_sink(
        "Sinks",
        field_names, field_types, source_sink_utils.TestAppendSink())

    # The source is not registered under a name; it is wrapped into a Table directly.
    source = t_env.from_table_source(csv_source)
    source.insert_into("Sinks")
    t_env.execute()

    actual = source_sink_utils.results()
    expected = ['1,Hi,Hello', '2,Hi,Hello']
    self.assert_equals(actual, expected)
def test_sql_query(self):
    """Run ``select a + 1, b, c`` via ``sql_query`` on a CSV-backed table and check the sink."""
    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env = self.t_env
    t_env.register_table_source("Source", csv_source)
    source = t_env.scan("Source")
    # The sink uses the same schema as the source.
    t_env.register_table_sink(
        "sinks",
        field_names, field_types, source_sink_utils.TestAppendSink())

    # The Table object is interpolated directly into the SQL text.
    result = t_env.sql_query("select a + 1, b, c from %s" % source)
    result.insert_into("sinks")
    t_env.execute()

    actual = source_sink_utils.results()
    expected = ['2,Hi,Hello', '3,Hello,Hello']
    self.assert_equals(actual, expected)
def test_filter(self):
    """Filter a CSV-backed table with ``a > 1 && b = 'Hello'`` and check the remaining row."""
    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env = self.t_env
    t_env.register_table_source("Source", csv_source)
    source = t_env.scan("Source")
    # The sink uses the same schema as the source.
    t_env.register_table_sink(
        "Results",
        field_names, field_types, source_sink_utils.TestAppendSink())

    result = source.filter("a > 1 && b = 'Hello'")
    result.insert_into("Results")
    t_env.execute()

    actual = source_sink_utils.results()
    expected = ['2,Hello,Hello']
    self.assert_equals(actual, expected)
def test_rename_columns(self):
    """Rename columns ``a, b, c`` to ``d, e, f`` and check the rows are otherwise unchanged."""
    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env = self.t_env
    t_env.register_table_source("Source", csv_source)
    source = t_env.scan("Source")

    # The sink keeps the source's column types but uses the renamed columns.
    sink_field_names = ["d", "e", "f"]
    t_env.register_table_sink(
        "Results",
        sink_field_names, field_types, source_sink_utils.TestAppendSink())

    result = source.select("a, b, c") \
        .rename_columns("a as d, c as f, b as e") \
        .select("d, e, f")
    result.insert_into("Results")
    t_env.execute()

    actual = source_sink_utils.results()
    expected = ['1,Hi,Hello', '2,Hello,Hello']
    self.assert_equals(actual, expected)
def test_add_columns(self):
    """Add computed columns ``a + 1 as b`` and ``a + 2 as c`` to a one-column selection."""
    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env = self.t_env
    t_env.register_table_source("Source", csv_source)
    source = t_env.scan("Source")

    # The sink keeps the column names, but all three columns are integers here.
    sink_field_types = [DataTypes.INT, DataTypes.INT, DataTypes.INT]
    t_env.register_table_sink(
        "Results",
        field_names, sink_field_types, source_sink_utils.TestAppendSink())

    result = source.select("a").add_columns("a + 1 as b, a + 2 as c")
    result.insert_into("Results")
    t_env.execute()

    actual = source_sink_utils.results()
    expected = ['1,2,3', '2,3,4']
    self.assert_equals(actual, expected)
def test_sql_update_with_query_config(self):
    """Run ``sql_update`` with a query config that sets idle state retention times."""
    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env = self.t_env
    t_env.register_table_source("source", csv_source)
    # The sink uses the same schema as the source.
    t_env.register_table_sink(
        "sinks",
        field_names, field_types, source_sink_utils.TestAppendSink())

    # Retention times of one and two days are passed to the query config.
    query_config = t_env.query_config()
    query_config.with_idle_state_retention_time(
        datetime.timedelta(days=1), datetime.timedelta(days=2))

    t_env.sql_update("insert into sinks select * from source", query_config)
    t_env.execute("test_sql_job")

    actual = source_sink_utils.results()
    expected = ['1,Hi,Hello', '2,Hello,Hello']
    self.assert_equals(actual, expected)
def test_distinct(self):
    """Apply ``distinct()`` to a CSV-backed table with a duplicate row, then select two columns."""
    # Directory and file name are passed as separate path components.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello"), (2, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env = self.t_env
    t_env.register_table_source("Source", csv_source)
    source = t_env.scan("Source")

    # The sink receives only two columns: a and c (aliased to b).
    sink_field_names = ["a", "b"]
    sink_field_types = [DataTypes.INT(), DataTypes.STRING()]
    t_env.register_table_sink(
        "Results",
        sink_field_names, sink_field_types, source_sink_utils.TestRetractSink())

    result = source.distinct().select("a, c as b")
    result.insert_into("Results")
    t_env.execute()

    actual = source_sink_utils.results()
    # The duplicated (2, "Hello", "Hello") row is expected only once.
    expected = ['1,Hello', '2,Hello']
    self.assert_equals(actual, expected)