def test_from_data_stream(self):
    self.env.set_parallelism(1)

    ds = self.env.from_collection([(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
                                  type_info=Types.ROW([Types.INT(),
                                                       Types.STRING(),
                                                       Types.STRING()]))
    t_env = self.t_env
    table = t_env.from_data_stream(ds)
    field_names = ['a', 'b', 'c']
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink("Sink",
                              source_sink_utils.TestAppendSink(field_names, field_types))
    t_env.insert_into("Sink", table)
    t_env.execute("test_from_data_stream")
    result = source_sink_utils.results()
    expected = ['+I[1, Hi, Hello]', '+I[2, Hello, Hi]']
    self.assert_equals(result, expected)

    ds = ds.map(lambda x: x, Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))
    table = t_env.from_data_stream(ds, col('a'), col('b'), col('c'))
    t_env.register_table_sink("ExprSink",
                              source_sink_utils.TestAppendSink(field_names, field_types))
    t_env.insert_into("ExprSink", table)
    t_env.execute("test_from_data_stream_with_expr")
    result = source_sink_utils.results()
    self.assert_equals(result, expected)
def test_sliding_group_window_over_proctime(self):
    self.t_env.get_config().set("parallelism.default", "1")
    from pyflink.table.window import Slide
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a INT,
            proctime as PROCTIME()
        ) with(
            'connector' = 'datagen',
            'rows-per-second' = '1',
            'fields.a.kind' = 'sequence',
            'fields.a.start' = '1',
            'fields.a.end' = '10'
        )
    """
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")
    iterator = t.select(t.a, t.proctime) \
        .window(Slide.over(lit(1).seconds)
                .every(lit(1).seconds)
                .on(t.proctime)
                .alias("w")) \
        .group_by(t.a, col("w")) \
        .select(mean_udaf(t.a).alias("b"), col("w").start).execute().collect()
    result = [i for i in iterator]
    # if the WindowAssigner.isEventTime() does not return false,
    # the w.start would be 1970-01-01
    # TODO: After fixing the TimeZone problem of window with processing time (will be fixed in
    # FLIP-162), we should replace it with a more accurate assertion.
    self.assertTrue(result[0][1].year > 1970)
def test_tumble_group_window_aggregate_function(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

    sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b TIMESTAMP(3), c FLOAT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
    tumble_window = Tumble.over(lit(1).hours) \
        .on(col("rowtime")) \
        .alias("w")
    t.window(tumble_window) \
        .group_by(col("w")) \
        .select(col("w").start, col("w").end, mean_udaf(t.b)) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[2018-03-11T03:00, 2018-03-11T04:00, 2.2]",
                        "+I[2018-03-11T04:00, 2018-03-11T05:00, 8.0]"])
def test_table_function(self):
    self._register_table_sink(
        ['a', 'b', 'c'],
        [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT()])

    multi_emit = udtf(MultiEmit(), result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
    multi_num = udf(MultiNum(), result_type=DataTypes.BIGINT())

    t = self.t_env.from_elements([(1, 1, 3), (2, 1, 6), (3, 2, 9)], ['a', 'b', 'c'])
    t = t.join_lateral(multi_emit((t.a + t.a) / 2, multi_num(t.b)).alias('x', 'y'))
    t = t.left_outer_join_lateral(condition_multi_emit(t.x, t.y).alias('m')) \
        .select(t.x, t.y, col("m"))
    t = t.left_outer_join_lateral(identity(t.m).alias('n')) \
        .select(t.x, t.y, col("n"))
    actual = self._get_output(t)
    self.assert_equals(actual,
                       ["+I[1, 0, null]", "+I[1, 1, null]",
                        "+I[2, 0, null]", "+I[2, 1, null]",
                        "+I[3, 0, 0]", "+I[3, 0, 1]", "+I[3, 0, 2]",
                        "+I[3, 1, 1]", "+I[3, 1, 2]", "+I[3, 2, 2]",
                        "+I[3, 3, null]"])
def test_tumble_window(self):
    t = self.t_env.from_elements([(1, 1, "Hello")], ["a", "b", "c"])
    result = t.window(Tumble.over(expr.row_interval(2)).on("a").alias("w")) \
        .group_by(expr.col('w'), expr.col('c')).select(t.b.sum)

    query_operation = result._j_table.getQueryOperation().getChildren().get(0)
    self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
    self.assertEqual('TumbleWindow(field: [a], size: [2])',
                     query_operation.getGroupWindow().asSummaryString())
def test_session_window(self):
    t = self.t_env.from_elements([(1000, 1, "Hello")], ["a", "b", "c"])
    result = t.window(Session.with_gap(expr.lit(1).seconds).on("a").alias("w")) \
        .group_by(expr.col('w'), expr.col('c')).select(t.b.sum)

    query_operation = result._j_table.getQueryOperation().getChildren().get(0)
    self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
    self.assertEqual('SessionWindow(field: [a], gap: [1000])',
                     query_operation.getGroupWindow().asSummaryString())
def test_session_group_window_over_time(self):
    # create source file path
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_session_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    self.t_env.register_function("my_count", CountAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd'],
        [
            DataTypes.TINYINT(),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)
    t.window(Session.with_gap(lit(30).minutes).on(t.rowtime).alias("w")) \
        .group_by(t.a, t.b, col("w")) \
        .select(t.a, col("w").start, col("w").end, call("my_count", t.c).alias("c")) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[3, 2018-03-11 03:10:00.0, 2018-03-11 03:40:00.0, 1]",
                        "+I[2, 2018-03-11 03:10:00.0, 2018-03-11 04:00:00.0, 2]",
                        "+I[1, 2018-03-11 03:10:00.0, 2018-03-11 04:10:00.0, 2]",
                        "+I[1, 2018-03-11 04:20:00.0, 2018-03-11 04:50:00.0, 1]"])
def test_expressions(self):
    expr1 = col('a')
    expr2 = col('b')
    expr3 = col('c')

    self.assertEqual('10', str(lit(10, DataTypes.INT(False))))
    self.assertEqual('rangeTo(1, 2)', str(range_(1, 2)))
    self.assertEqual('and(a, b, c)', str(and_(expr1, expr2, expr3)))
    self.assertEqual('or(a, b, c)', str(or_(expr1, expr2, expr3)))

    from pyflink.table.expressions import UNBOUNDED_ROW, UNBOUNDED_RANGE, CURRENT_ROW, \
        CURRENT_RANGE
    self.assertEqual('unboundedRow()', str(UNBOUNDED_ROW))
    self.assertEqual('unboundedRange()', str(UNBOUNDED_RANGE))
    self.assertEqual('currentRow()', str(CURRENT_ROW))
    self.assertEqual('currentRange()', str(CURRENT_RANGE))

    self.assertEqual('currentDate()', str(current_date()))
    self.assertEqual('currentTime()', str(current_time()))
    self.assertEqual('currentTimestamp()', str(current_timestamp()))
    self.assertEqual('localTime()', str(local_time()))
    self.assertEqual('localTimestamp()', str(local_timestamp()))
    # use assertEqual; assertEquals is a deprecated unittest alias
    self.assertEqual('toTimestampLtz(123, 0)', str(to_timestamp_ltz(123, 0)))
    self.assertEqual("temporalOverlaps(cast('2:55:00', TIME(0)), 3600000, "
                     "cast('3:30:00', TIME(0)), 7200000)",
                     str(temporal_overlaps(
                         lit("2:55:00").to_time,
                         lit(1).hours,
                         lit("3:30:00").to_time,
                         lit(2).hours)))
    self.assertEqual("dateFormat(time, '%Y, %d %M')",
                     str(date_format(col("time"), "%Y, %d %M")))
    self.assertEqual("timestampDiff(DAY, cast('2016-06-15', DATE), cast('2016-06-18', DATE))",
                     str(timestamp_diff(
                         TimePointUnit.DAY,
                         lit("2016-06-15").to_date,
                         lit("2016-06-18").to_date)))
    self.assertEqual('array(1, 2, 3)', str(array(1, 2, 3)))
    self.assertEqual("row('key1', 1)", str(row("key1", 1)))
    self.assertEqual("map('key1', 1, 'key2', 2, 'key3', 3)",
                     str(map_("key1", 1, "key2", 2, "key3", 3)))
    self.assertEqual('4', str(row_interval(4)))
    self.assertEqual('pi()', str(pi()))
    self.assertEqual('e()', str(e()))
    self.assertEqual('rand(4)', str(rand(4)))
    self.assertEqual('randInteger(4)', str(rand_integer(4)))
    self.assertEqual('atan2(1, 2)', str(atan2(1, 2)))
    self.assertEqual('minusPrefix(a)', str(negative(expr1)))
    self.assertEqual('concat(a, b, c)', str(concat(expr1, expr2, expr3)))
    self.assertEqual("concat_ws(', ', b, c)", str(concat_ws(', ', expr2, expr3)))
    self.assertEqual('uuid()', str(uuid()))
    self.assertEqual('null', str(null_of(DataTypes.BIGINT())))
    self.assertEqual('log(a)', str(log(expr1)))
    self.assertEqual('ifThenElse(a, b, c)', str(if_then_else(expr1, expr2, expr3)))
    self.assertEqual('withColumns(a, b, c)', str(with_columns(expr1, expr2, expr3)))
    self.assertEqual('a.b.c(a)', str(call('a.b.c', expr1)))
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())

    # used to test pipeline.jars and pipeline.classpaths
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().get_configuration().set_string(config_key, config_value)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
    """.format(result_path)
    t_env.execute_sql(sink_ddl)

    t_env.execute_sql("create temporary system function add_one as 'add_one.add_one' "
                      "language python")
    t_env.register_java_function("add_one_java", "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t = t_env.from_elements(elements, ["word", "count"])
    t.select(t.word,
             call("add_one", t.count).alias("count"),
             call("add_one_java", t.count).alias("count_java")) \
        .group_by(t.word) \
        .select(t.word,
                col("count").count.alias("count"),
                col("count_java").count.alias("count_java")) \
        .execute_insert("Results")
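# --- add_one.py (sketch) ---------------------------------------------------
# The DDL above resolves 'add_one.add_one' to a Python scalar function named
# add_one inside a module named add_one on the job's Python path. A minimal
# sketch of what that module could contain, assuming the usual udf-decorator
# style; the actual module is shipped separately and may differ.

from pyflink.table import DataTypes
from pyflink.table.udf import udf


@udf(result_type=DataTypes.BIGINT())
def add_one(i):
    # add one to the input and return it as BIGINT
    return i + 1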
def demo01():
    # environment configuration
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
            .in_batch_mode()
            .use_blink_planner()
            .build())

    # register Orders table and Result table sink in table environment
    source_data_path = "/path/to/source/directory/"
    result_data_path = "/path/to/result/directory/"
    source_ddl = f"""
        create table Orders(
            a VARCHAR,
            b BIGINT,
            c BIGINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '{source_data_path}'
        )
    """
    t_env.execute_sql(source_ddl)

    sink_ddl = f"""
        create table `Result`(
            a VARCHAR,
            cnt BIGINT
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '{result_data_path}'
        )
    """
    t_env.execute_sql(sink_ddl)

    # specify table program
    orders = t_env.from_path("Orders")  # schema (a, b, c, rowtime)
    # sink table identifiers are case-sensitive, so use "Result" as declared in the DDL
    orders.group_by("a").select(orders.a, orders.b.count.alias('cnt')) \
        .execute_insert("Result").wait()

    # further Table API operations on the same table (results are not consumed here)
    orders.where(orders.a == 'red')
    orders.filter(orders.b % 2 == 0)
    orders.add_columns(concat(orders.c, 'sunny'))
    orders.add_or_replace_columns(concat(orders.c, 'sunny').alias('desc'))
    orders.drop_columns(orders.b, orders.c)
    orders.rename_columns(orders.b.alias('b2'), orders.c.alias('c2'))
    orders.group_by(orders.a).select(orders.a, orders.b.sum.alias('d'))
    # tab.group_by(tab.key).select(tab.key, tab.value.avg.alias('average'))
    # tab.group_by("key").select("key, value.avg as average")

    result = orders.filter(orders.a.is_not_null & orders.b.is_not_null & orders.c.is_not_null) \
        .select(orders.a.lower_case.alias('a'), orders.b, orders.rowtime) \
        .window(Tumble.over(lit(1).hour).on(orders.rowtime).alias("hourly_window")) \
        .group_by(col('hourly_window'), col('a')) \
        .select(col('a'),
                col('hourly_window').end.alias('hour'),
                col('b').avg.alias('avg_billing_amount'))
def test_over_window(self):
    t_env = self.t_env
    t = t_env.from_elements([(1, 1, "Hello")], ['a', 'b', 'c'])
    result = t.over_window(
        Over.partition_by(t.c).order_by(t.a)
            .preceding(expr.row_interval(2))
            .following(expr.CURRENT_ROW)
            .alias("w"))
    self.assertRaisesRegex(
        Py4JJavaError, "Ordering must be defined on a time attribute",
        result.select, expr.col("b").sum.over(expr.col("w")))
def word_count(input_path, output_path):
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    # write all the data to one file
    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    # define the source
    if input_path is not None:
        t_env.create_temporary_table(
            'source',
            TableDescriptor.for_connector('filesystem')
                .schema(Schema.new_builder()
                        .column('word', DataTypes.STRING())
                        .build())
                .option('path', input_path)
                .format('csv')
                .build())
        tab = t_env.from_path('source')
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        tab = t_env.from_elements(map(lambda i: (i,), word_count_data),
                                  DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())]))

    # define the sink
    if output_path is not None:
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('filesystem')
                .schema(Schema.new_builder()
                        .column('word', DataTypes.STRING())
                        .column('count', DataTypes.BIGINT())
                        .build())
                .option('path', output_path)
                .format(FormatDescriptor.for_format('canal-json')
                        .build())
                .build())
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('print')
                .schema(Schema.new_builder()
                        .column('word', DataTypes.STRING())
                        .column('count', DataTypes.BIGINT())
                        .build())
                .build())

    @udtf(result_types=[DataTypes.STRING()])
    def split(line: Row):
        for s in line[0].split():
            yield Row(s)

    # compute word count
    tab.flat_map(split).alias('word') \
        .group_by(col('word')) \
        .select(col('word'), lit(1).count) \
        .execute_insert('sink') \
        .wait()
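# A minimal entry point for the word_count example above, following the usual
# PyFlink example layout; the flag names (--input/--output) are assumptions for
# illustration, not taken from this file.

import argparse
import logging
import sys

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=False,
                        help='Output file to write results to.')

    known_args, _ = parser.parse_known_args(sys.argv[1:])
    word_count(known_args.input, known_args.output)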
def test_aggregate_with_pandas_udaf(self):
    t = self.t_env.from_elements(
        [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT())
        ]))

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c'],
        [DataTypes.TINYINT(), DataTypes.FLOAT(), DataTypes.INT()])
    self.t_env.register_table_sink("Results", table_sink)
    pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.a.max()),
                       result_type=DataTypes.ROW([
                           DataTypes.FIELD("a", DataTypes.FLOAT()),
                           DataTypes.FIELD("b", DataTypes.INT())
                       ]),
                       func_type="pandas")
    t.select(t.a, t.b) \
        .group_by(t.a) \
        .aggregate(pandas_udaf) \
        .select(expr.col("*")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[1, 5.0, 1]", "+I[2, 2.0, 2]"])
def test_aggregate_with_pandas_udaf(self):
    t = self.t_env.from_elements(
        [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT())]))

    sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b FLOAT, c INT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)
    pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.a.max()),
                       result_type=DataTypes.ROW(
                           [DataTypes.FIELD("a", DataTypes.FLOAT()),
                            DataTypes.FIELD("b", DataTypes.INT())]),
                       func_type="pandas")
    t.select(t.a, t.b) \
        .group_by(t.a) \
        .aggregate(pandas_udaf) \
        .select(expr.col("*")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[1, 5.0, 1]", "+I[2, 2.0, 2]"])
def test_window_aggregate_with_pandas_udaf(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [(1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
         (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
         (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
            DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))
        ]))

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c'],
        [DataTypes.TIMESTAMP(3), DataTypes.FLOAT(), DataTypes.INT()])
    self.t_env.register_table_sink("Results", table_sink)
    pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                       result_type=DataTypes.ROW([
                           DataTypes.FIELD("a", DataTypes.FLOAT()),
                           DataTypes.FIELD("b", DataTypes.INT())
                       ]),
                       func_type="pandas")
    tumble_window = Tumble.over(expr.lit(1).hours) \
        .on(expr.col("rowtime")) \
        .alias("w")
    t.select(t.b, t.rowtime) \
        .window(tumble_window) \
        .group_by(expr.col("w")) \
        .aggregate(pandas_udaf.alias("d", "e")) \
        .select(expr.col("w").rowtime, expr.col("d"), expr.col("e")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "+I[2018-03-11 03:59:59.999, 2.2, 3]",
        "+I[2018-03-11 04:59:59.999, 8.0, 8]"
    ])
def _get_or_create_java_expression(expr: Union["Expression", str]):
    if isinstance(expr, Expression):
        return expr._j_expr
    elif isinstance(expr, str):
        from pyflink.table.expressions import col
        return col(expr)._j_expr
    else:
        raise TypeError(
            "Invalid argument: expected Expression or string, got {0}.".format(type(expr)))
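# A minimal usage sketch for the helper above: a plain column name and an
# equivalent Expression normalize to the same underlying Java expression, so
# callers can accept either form. Requires a running JVM gateway, as with any
# PyFlink expression; the variable names here are illustrative.

from pyflink.table.expressions import col

j_from_str = _get_or_create_java_expression("a")
j_from_expr = _get_or_create_java_expression(col("a"))
# both wrap the Java API expression for column 'a'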
def test_window_aggregate_with_pandas_udaf(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

    sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b FLOAT, c INT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)
    print(t.get_schema())
    pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                       result_type=DataTypes.ROW(
                           [DataTypes.FIELD("a", DataTypes.FLOAT()),
                            DataTypes.FIELD("b", DataTypes.INT())]),
                       func_type="pandas")
    tumble_window = Tumble.over(expr.lit(1).hours) \
        .on(expr.col("rowtime")) \
        .alias("w")
    t.select(t.b, t.rowtime) \
        .window(tumble_window) \
        .group_by(expr.col("w")) \
        .aggregate(pandas_udaf.alias("d", "e")) \
        .select(expr.col("w").rowtime, expr.col("d"), expr.col("e")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[2018-03-11 03:59:59.999, 2.2, 3]",
                        "+I[2018-03-11 04:59:59.999, 8.0, 8]"])
def mixing_use_of_datastream_and_table():
    # use StreamTableEnvironment instead of TableEnvironment when mixing use of table & datastream
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source
    t_env.create_temporary_table(
        'source',
        TableDescriptor.for_connector('datagen')
            .schema(Schema.new_builder()
                    .column('id', DataTypes.BIGINT())
                    .column('data', DataTypes.STRING())
                    .build())
            .option("number-of-rows", "10")
            .build())

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
            .schema(Schema.new_builder()
                    .column('a', DataTypes.BIGINT())
                    .build())
            .build())

    @udf(result_type=DataTypes.BIGINT())
    def length(data):
        return len(data)

    # perform table api operations
    table = t_env.from_path("source")
    table = table.select(col('id'), length(col('data')))

    # convert table to datastream and perform datastream api operations
    ds = t_env.to_data_stream(table)
    ds = ds.map(lambda i: i[0] + i[1], output_type=Types.LONG())

    # convert datastream to table and perform table api operations as you want
    table = t_env.from_data_stream(
        ds,
        Schema.new_builder().column("f0", DataTypes.BIGINT()).build())

    # execute
    table.execute_insert('sink').wait()