def test_create_table_environment_with_blink_planner(self):
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())

    planner = t_env._j_tenv.getPlanner()

    self.assertEqual(
        planner.getClass().getName(),
        "org.apache.flink.table.planner.delegation.StreamPlanner")

    t_env = StreamTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance().build())

    planner = t_env._j_tenv.getPlanner()

    self.assertEqual(
        planner.getClass().getName(),
        "org.apache.flink.table.planner.delegation.StreamPlanner")

    t_env = StreamTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance().use_old_planner().build())

    planner = t_env._j_tenv.getPlanner()

    self.assertEqual(
        planner.getClass().getName(),
        "org.apache.flink.table.planner.StreamPlanner")
def test_create_table_environment_with_old_planner(self):
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .in_batch_mode().use_old_planner().build())

    self.assertEqual(
        t_env._j_tenv.getClass().getName(),
        "org.apache.flink.table.api.bridge.java.internal.BatchTableEnvironmentImpl")
def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 300000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)
    t_env.register_table_sink(
        "sink",
        PrintTableSink(
            ["id"],
            [DataTypes.INT(False)]))

    @udf(input_types=[DataTypes.INT(False)], result_type=DataTypes.INT(False))
    def inc(x):
        return x + 1

    t_env.register_function("inc", inc)
    t_env.register_java_function("java_inc", "com.alibaba.flink.function.JavaInc")

    num_rows = 100000000

    t_env.from_table_source(RangeTableSource(1, num_rows, 1)).alias("id") \
        .select("inc(id)") \
        .insert_into("sink")

    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink Python UDF inc() consume time: " + str(time.time() - beg_time))
def _local_execute_func(exec_func, write_func, pickle_func, python_path):
    table_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .use_blink_planner().in_batch_mode().build())
    table_env.get_config().get_configuration().set_string(
        'parallelism.default', '1')
    table_env.get_config().set_python_executable(python_path)
    table_env.register_function(
        exec_func,
        udf(lambda _: pickle_func, DataTypes.BIGINT(), DataTypes.STRING()))
    table_env.connect(FileSystem().path(write_func)) \
        .with_format(OldCsv().field('func', DataTypes.STRING())) \
        .with_schema(Schema().field('func', DataTypes.STRING())) \
        .create_temporary_table(exec_func)
    table = table_env.from_elements([(1, 'Joblib')])
    table.select('{}(_1)'.format(exec_func)).insert_into(exec_func)
    table_env.execute(exec_func)
    # decode the execution result from the table sink file.
    execute_result = cloudpickle.loads(
        codecs.decode(
            pd.DataFrame(pd.read_csv(write_func))[0:].columns[0].encode(),
            'base64'))
    # remove the table sink file to clean up intermediate files.
    os.remove(write_func)
    return execute_result
def __init__(self):
    # self.feature_extractor = DemoFeatureExtractor()
    self.settings = EnvironmentSettings.new_instance().in_streaming_mode() \
        .use_blink_planner().build()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(1)
    self.table_env = StreamTableEnvironment.create(
        self.env, environment_settings=self.settings)
    self.table_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)
    self.table_env.get_config().get_configuration().set_string(
        "python.fn-execution.buffer.memory.size", "1024mb")
    self.table_env.get_config().get_configuration().set_string(
        "parallelism.default", "3")
    self.table_env.get_config().get_configuration().set_string(
        "python.fn-execution.bundle.size", "5000")
    self.table_env.get_config().get_configuration().set_string(
        "restart-strategy", "fixed-delay")
    self.table_env.get_config().get_configuration().set_string(
        "restart-strategy.fixed-delay.attempts", "3")
    self.table_env.get_config().get_configuration().set_string(
        "restart-strategy.fixed-delay.delay", "30s")
    source_table = open('source.sql', 'r').read()
    sink_table = open('sink.sql', 'r').read()
    self.table_env.execute_sql(source_table)
    self.table_env.execute_sql(sink_table)
def test_table_environment_with_blink_planner(self):
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .in_batch_mode().use_blink_planner().build())

    source_path = os.path.join(self.tempdir + '/streaming.csv')
    sink_path = os.path.join(self.tempdir + '/results')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env.register_table_source("source", csv_source)
    t_env.register_table_sink(
        "sink",
        CsvTableSink(field_names, field_types, sink_path))
    source = t_env.scan("source")
    result = source.alias("a, b, c").select("1 + a, b, c")
    result.insert_into("sink")
    t_env.execute("blink_test")

    results = []
    for root, dirs, files in os.walk(sink_path):
        for sub_file in files:
            with open(os.path.join(root, sub_file), 'r') as f:
                line = f.readline()
                while line is not None and line != '':
                    results.append(line)
                    line = f.readline()

    self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
def test_table_environment_with_blink_planner(self):
    self.env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance()
        .use_blink_planner().build())

    source_path = os.path.join(self.tempdir + '/streaming.csv')
    sink_path = os.path.join(self.tempdir + '/result.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env.register_table_source("source", csv_source)
    t_env.register_table_sink(
        "sink",
        CsvTableSink(field_names, field_types, sink_path))
    source = t_env.scan("source")
    result = source.alias("a, b, c").select("1 + a, b, c")
    result.insert_into("sink")
    t_env.execute("blink_test")

    results = []
    with open(sink_path, 'r') as f:
        results.append(f.readline())
        results.append(f.readline())

    self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
def init_env(self, **kwargs):
    env = StreamExecutionEnvironment.get_execution_environment()
    self.st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
    return
def full_outer_join_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements(
        [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"),
         (2, "4b", "4bb"), (5, "5a", "5aa")],
        ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements(
        [(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")],
        ["d", "e", "f"]).select("d, e, f")

    result = left.full_outer_join(right, "a = d").select("a, b, e")
    # use custom retract sink connector
    sink = TestRetractSink(
        ["a", "b", "c"],
        [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("full outer join streaming")
def alias_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_alias_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(),
                        DataTypes.INT(),
                        DataTypes.INT(),
                        DataTypes.TIMESTAMP()]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"],
                     [DataTypes.STRING(),
                      DataTypes.INT(),
                      DataTypes.INT(),
                      DataTypes.TIMESTAMP()],
                     result_file))
    orders = st_env.scan("Orders")
    result = orders.alias("x, y, z, t").select("x, y, z, t")
    result.insert_into("result")
    st_env.execute("alias streaming")
def test_to_configuration(self):
    expected_settings = EnvironmentSettings.new_instance().in_batch_mode().build()
    config = expected_settings.to_configuration()

    self.assertEqual("BATCH", config.get_string("execution.runtime-mode", "stream"))
def setUp(self):
    super(PyFlinkBlinkStreamTableTestCase, self).setUp()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)
    self.t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    t_env = TableEnvironment.create(environment_settings=env_settings)

    # used to test pipeline.jars and pipeline.classpaths
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().get_configuration().set_string(config_key, config_value)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    t_env.execute_sql(
        "create temporary system function add_one as 'add_one.add_one' language python")
    t_env.register_java_function("add_one_java", "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .select("word, add_one(count) as count, add_one_java(count) as count_java") \
        .group_by("word") \
        .select("word, count(count) as count, count(count_java) as count_java") \
        .execute_insert("Results")
def test_planner_selection(self):
    gateway = get_gateway()
    CLASS_NAME = gateway.jvm.EnvironmentSettings.CLASS_NAME

    builder = EnvironmentSettings.new_instance()

    OLD_PLANNER_FACTORY = get_private_field(builder._j_builder, "OLD_PLANNER_FACTORY")
    OLD_EXECUTOR_FACTORY = get_private_field(builder._j_builder, "OLD_EXECUTOR_FACTORY")
    BLINK_PLANNER_FACTORY = get_private_field(builder._j_builder, "BLINK_PLANNER_FACTORY")
    BLINK_EXECUTOR_FACTORY = get_private_field(builder._j_builder, "BLINK_EXECUTOR_FACTORY")

    # test the default behaviour to make sure it is consistent with the python doc
    environment_settings = builder.build()

    self.assertEqual(
        environment_settings._j_environment_settings.toPlannerProperties()[CLASS_NAME],
        OLD_PLANNER_FACTORY)
    self.assertEqual(
        environment_settings._j_environment_settings.toExecutorProperties()[CLASS_NAME],
        OLD_EXECUTOR_FACTORY)

    # test use_old_planner
    environment_settings = builder.use_old_planner().build()

    self.assertEqual(
        environment_settings._j_environment_settings.toPlannerProperties()[CLASS_NAME],
        OLD_PLANNER_FACTORY)
    self.assertEqual(
        environment_settings._j_environment_settings.toExecutorProperties()[CLASS_NAME],
        OLD_EXECUTOR_FACTORY)

    # test use_blink_planner
    environment_settings = builder.use_blink_planner().build()

    self.assertEqual(
        environment_settings._j_environment_settings.toPlannerProperties()[CLASS_NAME],
        BLINK_PLANNER_FACTORY)
    self.assertEqual(
        environment_settings._j_environment_settings.toExecutorProperties()[CLASS_NAME],
        BLINK_EXECUTOR_FACTORY)

    # test use_any_planner
    environment_settings = builder.use_any_planner().build()

    self.assertTrue(
        CLASS_NAME not in environment_settings._j_environment_settings.toPlannerProperties())
    self.assertTrue(
        CLASS_NAME not in environment_settings._j_environment_settings.toExecutorProperties())
def test_blink_from_element(self):
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .use_blink_planner().in_batch_mode().build())
    field_names = ["a", "b", "c", "d", "e", "f", "g", "h",
                   "i", "j", "k", "l", "m", "n", "o", "p", "q", "r"]
    field_types = [DataTypes.BIGINT(), DataTypes.DOUBLE(), DataTypes.STRING(),
                   DataTypes.STRING(), DataTypes.DATE(),
                   DataTypes.TIME(),
                   DataTypes.TIMESTAMP(),
                   DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(),
                   DataTypes.INTERVAL(DataTypes.DAY(), DataTypes.SECOND()),
                   DataTypes.ARRAY(DataTypes.DOUBLE()),
                   DataTypes.ARRAY(DataTypes.DOUBLE(False)),
                   DataTypes.ARRAY(DataTypes.STRING()),
                   DataTypes.ARRAY(DataTypes.DATE()),
                   DataTypes.DECIMAL(10, 0),
                   DataTypes.ROW([DataTypes.FIELD("a", DataTypes.BIGINT()),
                                  DataTypes.FIELD("b", DataTypes.DOUBLE())]),
                   DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
                   DataTypes.BYTES(), PythonOnlyUDT()]
    schema = DataTypes.ROW(
        list(map(lambda field_name, field_type: DataTypes.FIELD(field_name, field_type),
                 field_names, field_types)))
    table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
    t_env.register_table_sink("Results", table_sink)
    t = t_env.from_elements(
        [(1, 1.0, "hi", "hello", datetime.date(1970, 1, 2), datetime.time(1, 0, 0),
          datetime.datetime(1970, 1, 2, 0, 0), datetime.datetime(1970, 1, 2, 0, 0),
          datetime.timedelta(days=1, microseconds=10),
          [1.0, None], array.array("d", [1.0, 2.0]), ["abc"],
          [datetime.date(1970, 1, 2)], Decimal(1), Row("a", "b")(1, 2.0),
          {"key": 1.0}, bytearray(b'ABCD'), PythonOnlyPoint(3.0, 4.0))],
        schema)
    t.insert_into("Results")
    t_env.execute("test")
    actual = source_sink_utils.results()

    expected = ['1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
                '1970-01-02 00:00:00.0,86400000,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
                '1.000000000000000000,1,2.0,{key=1.0},[65, 66, 67, 68],[3.0, 4.0]']
    self.assert_equals(actual, expected)
def test_to_Configuration(self):
    expected_settings = \
        EnvironmentSettings.new_instance().use_old_planner().in_batch_mode().build()
    config = expected_settings.to_configuration()

    self.assertEqual("OLD", config.get_string("table.planner", "blink"))
    self.assertEqual("BATCH", config.get_string("execution.runtime-mode", "stream"))
def test_planner_selection(self):
    builder = EnvironmentSettings.new_instance()

    # test the default behaviour to make sure it is consistent with the python doc
    environment_settings = builder.build()
    self.check_blink_planner(environment_settings)

    # test use_blink_planner
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    self.check_blink_planner(environment_settings)

    # test use_any_planner
    environment_settings = builder.use_any_planner().build()
    self.check_any_planner(environment_settings)
def group_by_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(),
                        DataTypes.INT(),
                        DataTypes.INT(),
                        DataTypes.TIMESTAMP()]))

    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("group_by_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())) \
        .with_format(
            Json()
            .derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("result")

    orders = st_env.scan("Orders")
    group_by_table = orders.group_by("a").select("a, b.sum as d")
    # Because the schema of the index used in Elasticsearch is
    # {"a":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
    #  "b":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}},
    # we need to cast the type in this demo.
    st_env.register_table("group_table", group_by_table)
    result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table")
    result.insert_into("result")
    st_env.execute("group by agg streaming")
def create_table_env(self):
    exec_env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .in_batch_mode().use_blink_planner().build())
    t_env._j_tenv.getPlanner().getExecEnv().setParallelism(1)
    statement_set = t_env.create_statement_set()
    t_env.get_config().set_python_executable('/usr/bin/python3')
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)
    return exec_env, t_env, statement_set
def check_blink_planner(self, settings: EnvironmentSettings):
    gateway = get_gateway()
    CLASS_NAME = gateway.jvm.EnvironmentSettings.CLASS_NAME

    builder = EnvironmentSettings.new_instance()
    BLINK_PLANNER_FACTORY = get_private_field(builder._j_builder, "BLINK_PLANNER_FACTORY")

    self.assertEqual(
        settings._j_environment_settings.toPlannerProperties()[CLASS_NAME],
        BLINK_PLANNER_FACTORY)
def create_table_env(self):
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    stream_env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        stream_env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    statement_set = t_env.create_statement_set()
    t_env.get_config().set_python_executable('/usr/bin/python3')
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)
    return stream_env, t_env, statement_set
def hello_world():
    """
    Read data from a random source and write it directly to a print sink.
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
        CREATE TABLE random_source (
            f_sequence INT,
            f_random INT,
            f_random_str STRING
        ) WITH (
            'connector' = 'datagen',
            'rows-per-second'='5',
            'fields.f_sequence.kind'='sequence',
            'fields.f_sequence.start'='1',
            'fields.f_sequence.end'='1000',
            'fields.f_random.min'='1',
            'fields.f_random.max'='1000',
            'fields.f_random_str.length'='10'
        )
        """

    sink_ddl = """
        CREATE TABLE print_sink (
            f_sequence INT,
            f_random INT,
            f_random_str STRING
        ) WITH (
            'connector' = 'print'
        )
        """

    # register the source and sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # register the UDF
    t_env.register_function('pass_by', pass_by)

    # read from the source
    tab = t_env.from_path("random_source")

    # For now we use the deprecated API, because the new asynchronous submission still needs improvement...
    tab.select("f_sequence, f_random, pass_by(f_random_str) ").insert_into("print_sink")

    # submit the job
    t_env.execute("Flink Hello World")
def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///app/src/kafka-clients-2.8.0.jar")
    env.add_jars("file:///app/src/flink-connector-kafka_2.12-1.12.3.jar")
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    config = env.get_checkpoint_config()
    config.enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION)

    st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())

    print("register kafka source")
    register_kafka_source(st_env)
    print("register transaction sinks")
    register_transactions_sink_into_csv(st_env)

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w")) \
        .select("""count(message) as total,
                   w.end as end_time""") \
        .insert_into("total_sink")

    st_env.from_path("source_tbl") \
        .where("message = 'dolorem'") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w")) \
        .select("""count(message) as total,
                   w.end as end_time""") \
        .insert_into("grep_sink")

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w"), col("message")) \
        .select("""count(message) as total,
                   message,
                   w.end as end_time""") \
        .insert_into("topk_sink")

    st_env.execute("app")
def load(token):
    # fetch the trading date dimension data
    pro = ts.pro_api(token)
    df = pro.query(
        'stock_basic', list_status='L',
        fields='ts_code,symbol,name,area,industry,market,curr_type,list_date,is_hs')

    # create the entry point of the Flink program
    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # convert the pandas DataFrame into a Table and give it an alias by creating a temporary view
    table = table_env.from_pandas(df)
    table_env.create_temporary_view("stock_info", table)

    # declare the output sink
    sink_ddl = """
        -- register a MySQL table 'users' in Flink SQL
        create table Results(
            ts_code STRING,
            symbol STRING,
            name STRING,
            area STRING,
            industry STRING,
            market STRING,
            curr_type STRING,
            list_date STRING,
            is_hs STRING
        ) with (
            'connector' = 'jdbc',
            'url' = 'jdbc:mysql://localhost:3306/shares?useUnicode=yes&characterEncoding=UTF-8&useSSL=false',
            'table-name' = 'dim_stock',
            'username' = 'root',
            'password' = '123456'
        )
        """
    table_env.execute_sql(sink_ddl)

    # the JDBC connector requires extra Java jars on the classpath
    table_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///home/wy/shares/mysql-connector-java-5.1.49.jar;"
        "file:///home/wy/shares/flink-connector-jdbc_2.12-1.12.2.jar")

    # when running in mini-cluster mode, call wait() so the job finishes before the process exits
    table_env.execute_sql(
        "insert into Results select * from stock_info").wait()
def test_add_python_file(self):
    import uuid
    python_file_dir = os.path.join(self.tempdir, "python_file_dir_" + str(uuid.uuid4()))
    os.mkdir(python_file_dir)
    python_file_path = os.path.join(python_file_dir, "test_dep1.py")
    with open(python_file_path, 'w') as f:
        f.write("def add_two(a):\n    return a + 2")

    def plus_two_map(value):
        from test_dep1 import add_two
        return add_two(value)

    get_j_env_configuration(self.env._j_stream_execution_environment).\
        setString("taskmanager.numberOfTaskSlots", "10")
    self.env.add_python_file(python_file_path)
    ds = self.env.from_collection([1, 2, 3, 4, 5])
    ds = ds.map(plus_two_map, Types.LONG()) \
           .slot_sharing_group("data_stream") \
           .map(lambda i: i, Types.LONG()) \
           .slot_sharing_group("table")

    python_file_path = os.path.join(python_file_dir, "test_dep2.py")
    with open(python_file_path, 'w') as f:
        f.write("def add_three(a):\n    return a + 3")

    def plus_three(value):
        from test_dep2 import add_three
        return add_three(value)

    t_env = StreamTableEnvironment.create(
        stream_execution_environment=self.env,
        environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
    self.env.add_python_file(python_file_path)

    from pyflink.table.udf import udf
    from pyflink.table.expressions import col
    add_three = udf(plus_three, result_type=DataTypes.BIGINT())

    tab = t_env.from_data_stream(ds, 'a') \
               .select(add_three(col('a')))
    t_env.to_append_stream(tab, Types.ROW([Types.LONG()])) \
         .map(lambda i: i[0]) \
         .add_sink(self.test_sink)

    self.env.execute("test add_python_file")
    result = self.test_sink.get_results(True)
    expected = ['6', '7', '8', '9', '10']
    result.sort()
    expected.sort()
    self.assertEqual(expected, result)
def kafka_to_mysql():
    """
    Read JSON data from a Kafka source, e.g. {"msg": "welcome flink users..."}, and write it to MySQL.
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
        CREATE TABLE kafka_source (
            msg STRING
        ) WITH (
            'connector' = 'kafka-0.11',
            'topic' = 'cdn-log',
            'properties.bootstrap.servers' = 'kafka:9092',
            'format' = 'json',
            'scan.startup.mode' = 'latest-offset'
        )
        """

    sink_ddl = """
        CREATE TABLE mysql_sink (
            msg STRING
        ) WITH (
            'connector' = 'jdbc',
            'url' = 'jdbc:mysql://mysql:3306/flinkdb?characterEncoding=utf-8&useSSL=false',
            'table-name' = 'cdn_log',
            'username' = 'root',
            'password' = '123456',
            'sink.buffer-flush.max-rows' = '1'
        )
        """

    # register the source and sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # read from the source
    tab = t_env.from_path("kafka_source")

    # For now we use the deprecated API, because the new asynchronous submission still needs improvement...
    tab.insert_into("mysql_sink")

    # submit the job
    t_env.execute("kafka_to_mysql")
def test_stream():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)
    # t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 1000000)
    # t_env.get_config().get_configuration().set_boolean("table.exec.mini-batch.enabled", True)
    # t_env.get_config().get_configuration().set_integer("table.exec.mini-batch.allow-latency", 1000)
    # t_env.get_config().get_configuration().set_integer("table.exec.mini-batch.size", 100000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)

    t_env.create_temporary_function("python_avg", MeanAggregateFunction())
    t_env.create_java_temporary_system_function("java_avg", "com.alibaba.flink.function.JavaAvg")

    num_rows = 10000000

    t_env.execute_sql(f"""
        CREATE TABLE source (
            id INT,
            num INT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) WITH (
            'connector' = 'Range',
            'start' = '1',
            'end' = '{num_rows}',
            'step' = '1',
            'partition' = '200'
        )
        """)

    t_env.register_table_sink(
        "sink",
        PrintTableSink(
            ["num", "value"],
            [DataTypes.INT(False), DataTypes.FLOAT(False)], 1000000))

    # .group_by("num") \
    # .select("num % 1000 as num, id") \
    result = t_env.from_path("source") \
        .select("num % 1000 as num, id") \
        .group_by("num") \
        .select("num, python_avg(id)")
    result.insert_into("sink")

    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink stream group agg consume time: " + str(time.time() - beg_time))
def test_to_append_stream(self):
    self.env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
    table = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hi")], ["a", "b", "c"])
    new_table = table.select("a + 1, b + 'flink', c")
    ds = t_env.to_append_stream(table=new_table,
                                type_info=Types.ROW([Types.LONG(),
                                                     Types.STRING(),
                                                     Types.STRING()]))
    test_sink = DataStreamTestSinkFunction()
    ds.add_sink(test_sink)
    self.env.execute("test_to_append_stream")
    result = test_sink.get_results(False)
    expected = ['+I[2, Hiflink, Hello]', '+I[3, Helloflink, Hi]']
    self.assertEqual(result, expected)
def test_to_retract_stream(self):
    self.env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
    table = t_env.from_elements([(1, "Hi", "Hello"), (1, "Hi", "Hello")], ["a", "b", "c"])
    new_table = table.group_by("c").select("a.sum, c as b")
    ds = t_env.to_retract_stream(table=new_table,
                                 type_info=Types.ROW([Types.LONG(), Types.STRING()]))
    test_sink = DataStreamTestSinkFunction()
    ds.map(lambda x: x).add_sink(test_sink)
    self.env.execute("test_to_retract_stream")
    result = test_sink.get_results(True)
    expected = ["(True, Row(f0=1, f1='Hello'))",
                "(False, Row(f0=1, f1='Hello'))",
                "(True, Row(f0=2, f1='Hello'))"]
    self.assertEqual(result, expected)
def __init__(self):
    # self.feature_extractor = DemoFeatureExtractor()
    self.settings = EnvironmentSettings.new_instance().in_streaming_mode() \
        .use_blink_planner().build()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(1)
    self.table_env = StreamTableEnvironment.create(
        self.env, environment_settings=self.settings)
    self.table_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)
    self.table_env.add_python_file('feature_extractors')
    source_table = open('feature_extractors/source.sql', 'r').read()
    sink_table = open('feature_extractors/sink.sql', 'r').read()
    self.table_env.execute_sql(source_table)
    self.table_env.execute_sql(sink_table)