def alias_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_alias_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"],
                     [DataTypes.STRING(), DataTypes.INT(),
                      DataTypes.INT(), DataTypes.TIMESTAMP()],
                     result_file))
    orders = st_env.scan("Orders")
    result = orders.alias("x, y, z, t").select("x, y, z, t")
    result.insert_into("result")
    st_env.execute("alias streaming")
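
# These examples all read ../resources/table_orders.csv. A minimal sketch of a
# matching input file follows; the rows are illustrative only (not from the
# repo), but the column layout (a STRING, b INT, c INT, rowtime TIMESTAMP)
# mirrors the CsvTableSource declarations used throughout.
def write_sample_orders_sketch():
    path = os.getcwd() + "/../resources/table_orders.csv"
    with open(path, "w") as f:
        f.write("a,1,1,2013-01-01 00:14:13\n"
                "b,2,2,2013-01-01 00:24:13\n"
                "a,3,3,2013-01-01 00:34:13\n"
                "a,4,4,2013-01-01 01:14:13\n")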
def group_by_agg():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_agg.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b"],
                     [DataTypes.STRING(), DataTypes.INT()],
                     result_file))
    orders = bt_env.scan("Orders")
    result = orders.group_by("a").select("a, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg")
    with open(result_file, 'r') as f:
        print(f.read())
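
# Hedged sketch: the same aggregation expressed in SQL, assuming the "Orders"
# table is registered as above (the function name is illustrative, not part
# of the original examples):
def group_by_agg_sql_sketch(bt_env):
    return bt_env.sql_query("SELECT a, SUM(b) AS d FROM Orders GROUP BY a")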
def group_by_window_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_window_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "start", "end", "rowtime", "d"],
                     [DataTypes.STRING(), DataTypes.TIMESTAMP(),
                      DataTypes.TIMESTAMP(), DataTypes.TIMESTAMP(),
                      DataTypes.INT()],
                     result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("a, w") \
        .select("a, w.start, w.end, w.rowtime, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg batch")
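
# Hedged sketch: the equivalent query using Flink SQL's TUMBLE group-window
# functions, assuming the same "Orders" table (function name is illustrative):
def tumble_window_sql_sketch(bt_env):
    return bt_env.sql_query(
        "SELECT a, "
        "  TUMBLE_START(rowtime, INTERVAL '1' HOUR) AS w_start, "
        "  TUMBLE_END(rowtime, INTERVAL '1' HOUR) AS w_end, "
        "  SUM(b) AS d "
        "FROM Orders "
        "GROUP BY a, TUMBLE(rowtime, INTERVAL '1' HOUR)")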
def select_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b"],
                     [DataTypes.STRING(), DataTypes.INT()],
                     result_file))
    orders = st_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    st_env.execute("select streaming")
def where_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = os.getcwd() + "/../result/table_where_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"],
                     [DataTypes.STRING(), DataTypes.INT(),
                      DataTypes.INT(), DataTypes.TIMESTAMP()],
                     result_file))
    orders = bt_env.scan("Orders")
    # '===' is the equality operator in Table API expression strings
    result = orders.where("a === 'b'")
    result.insert_into("result")
    bt_env.execute("where batch")
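
# Hedged note: Table.filter is an alias for Table.where, so the predicate
# above could equivalently be written as (sketch; function name illustrative):
def where_batch_filter_sketch(orders):
    return orders.filter("a === 'b'")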
def add_columns_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_add_columns_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime", "d"],
                     [DataTypes.STRING(), DataTypes.INT(),
                      DataTypes.INT(), DataTypes.TIMESTAMP(),
                      DataTypes.STRING()],
                     result_file))
    orders = bt_env.scan("Orders")
    result = orders.add_columns("concat(a, '_sunny') as d")
    result.insert_into("result")
    bt_env.execute("add columns batch")
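
# Hedged note: add_columns has sibling operations on Table for reshaping
# columns; a small sketch of each (expressions and function name are
# illustrative, and availability may depend on the Flink version in use):
def column_ops_sketch(orders):
    renamed = orders.rename_columns("b as b2, c as c2")
    dropped = orders.drop_columns("c")
    replaced = orders.add_or_replace_columns("concat(a, '_sunny') as a")
    return renamed, dropped, replaced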
def test_select(self):
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    with open(source_path, 'w') as f:
        lines = '1,hi,hello\n' + '2,hi,hello\n'
        f.write(lines)
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env = self.t_env
    # register Orders table in table environment
    t_env.register_table_source(
        "Orders",
        CsvTableSource(source_path, field_names, field_types))
    t_env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(field_names, field_types))
    t_env.scan("Orders") \
        .select("a + 1, b, c") \
        .insert_into("Results")
    t_env.execute("test_select")
    actual = source_sink_utils.results()
    expected = ['2,hi,hello', '3,hi,hello']
    self.assert_equals(actual, expected)
def group_by_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("group_by_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())) \
        .with_format(
            Json()
            .derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("result")
    orders = st_env.scan("Orders")
    group_by_table = orders.group_by("a").select("a, b.sum as d")
    # The Elasticsearch index maps both fields as text:
    #   {"a": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}},
    #    "b": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}}}
    # so the aggregated INT column d must be cast to VARCHAR before writing.
    st_env.register_table("group_table", group_by_table)
    result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table")
    result.insert_into("result")
    st_env.execute("group by agg streaming")
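
# Hedged sketch for inspecting the sink output; assumes a local
# Elasticsearch 6.x instance on port 9200 and the third-party `requests`
# package (neither is part of the original example):
def check_es_output_sketch():
    import requests
    resp = requests.get(
        "http://localhost:9200/group_by_agg_streaming/_search")
    print(resp.json())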
def test_get_execution_plan(self):
    tmp_dir = tempfile.gettempdir()
    source_path = os.path.join(tmp_dir, 'streaming.csv')
    tmp_csv = os.path.join(tmp_dir, 'streaming2.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env = BatchTableEnvironment.create(self.env)
    csv_source = CsvTableSource(source_path, field_names, field_types)
    t_env.register_table_source("Orders", csv_source)
    t_env.register_table_sink(
        "Results",
        CsvTableSink(field_names, field_types, tmp_csv))
    t_env.scan("Orders").insert_into("Results")
    plan = self.env.get_execution_plan()
    # the plan must be valid JSON; json.loads raises if it is not
    json.loads(plan)
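
# Hedged sketch: pretty-printing the plan for manual inspection; function
# name is illustrative, get_execution_plan returns a JSON string as above:
def print_execution_plan_sketch(env):
    print(json.dumps(json.loads(env.get_execution_plan()), indent=2))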
def distinct_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    orders = st_env.scan("Orders")
    result = orders.select("a, b").distinct()
    # distinct() on an unbounded stream produces an updating result, so a
    # custom retract sink connector is used instead of CsvTableSink
    sink = TestRetractSink(["a", "b"], [DataTypes.STRING(), DataTypes.INT()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("distinct streaming")
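
# Hedged sketch: the same query in SQL; SELECT DISTINCT likewise emits
# retractions on a stream, so it needs the same kind of sink (function name
# is illustrative):
def distinct_sql_sketch(st_env):
    return st_env.sql_query("SELECT DISTINCT a, b FROM Orders")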
def slide_time_window_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_slide_time_window_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    # 60-minute window sliding every 10 minutes over the rowtime column
    result = orders.window(
        Slide.over("60.minutes").every("10.minutes")
        .on("rowtime").alias("w")) \
        .group_by("w").select("b.sum")
    result.insert_into("result")
    bt_env.execute("slide time window batch")
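
# Hedged sketch: sliding windows in SQL use the HOP group-window functions,
# with the slide interval first and the window size second (function name is
# illustrative):
def slide_window_sql_sketch(bt_env):
    return bt_env.sql_query(
        "SELECT SUM(b) "
        "FROM Orders "
        "GROUP BY HOP(rowtime, INTERVAL '10' MINUTE, INTERVAL '60' MINUTE)")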
def distinct_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_distinct_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["b"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    # sum only the distinct values of b within each group
    result = orders.group_by("a") \
        .select("b.sum.distinct as d")
    result.insert_into("result")
    bt_env.execute("distinct agg batch")
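
# Hedged sketch of a simple entry point for trying one of the examples above;
# pick any of the functions, and make sure the resource CSV exists first:
if __name__ == '__main__':
    distinct_agg_batch()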