def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
        .select(table.word, expr.lit(1).count.alias('count')) \
        .execute_insert("Results")
def pandas_udf_demo():
    # Init environment
    environment_settings = EnvironmentSettings.new_instance() \
        .use_blink_planner().in_batch_mode().build()
    t_env = TableEnvironment.create(environment_settings=environment_settings)
    t_env.get_config().get_configuration().set_string('parallelism.default', '1')

    # Define and register UDF
    add = udf(lambda i, j: i + j,
              [DataTypes.BIGINT(), DataTypes.BIGINT()],
              DataTypes.BIGINT(),
              udf_type="pandas")

    # Register Source
    t_env.execute_sql("""
        CREATE TABLE mySource (
            a BIGINT,
            b BIGINT
        ) WITH (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '/opt/examples/table/input/udf_add_input'
        )
    """)

    # Register Sink
    t_env.execute_sql("""
        CREATE TABLE mySink (
            `sum` BIGINT
        ) WITH (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '/opt/examples/table/output/pandas_udf_add_output'
        )
    """)

    # Query
    t_env.from_path('mySource') \
        .select(add(col('a'), col('b'))) \
        .insert_into('mySink')

    t_env.execute("5-pandas_udf_add")
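# Side note (not part of the original snippet): the demo above declares the vectorized UDF with
# the older `udf_type="pandas"` keyword. A minimal sketch of the same add() in the decorator
# style used by newer snippets in this collection, assuming a PyFlink version that accepts
# `func_type="pandas"`:

from pyflink.table import DataTypes
from pyflink.table.udf import udf


@udf(result_type=DataTypes.BIGINT(), func_type="pandas")
def pandas_add(i, j):
    # i and j arrive as pandas.Series; the addition is applied element-wise
    return i + j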
def test_with_built_in_database_name(self):
    gateway = get_gateway()
    DEFAULT_BUILTIN_DATABASE = gateway.jvm.EnvironmentSettings.DEFAULT_BUILTIN_DATABASE
    builder = EnvironmentSettings.new_instance()

    # test the default behaviour to make sure it is consistent with the python doc
    environment_settings = builder.build()
    self.assertEqual(environment_settings.get_built_in_database_name(),
                     DEFAULT_BUILTIN_DATABASE)

    environment_settings = builder.with_built_in_database_name("my_database").build()
    self.assertEqual(environment_settings.get_built_in_database_name(),
                     "my_database")
def test_to_append_stream(self):
    self.env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.in_streaming_mode())
    table = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hi")], ["a", "b", "c"])
    new_table = table.select(table.a + 1, table.b + 'flink', table.c)
    ds = t_env.to_append_stream(
        table=new_table,
        type_info=Types.ROW([Types.LONG(), Types.STRING(), Types.STRING()]))
    test_sink = DataStreamTestSinkFunction()
    ds.add_sink(test_sink)
    self.env.execute("test_to_append_stream")
    result = test_sink.get_results(False)
    expected = ['+I[2, Hiflink, Hello]', '+I[3, Helloflink, Hi]']
    self.assertEqual(result, expected)
def load(token, day):
    # fetch the stock dimension data (stock_basic) via tushare
    pro = ts.pro_api(token)
    df = pro.query('stock_basic', ts_code="000001.SZ", list_status='L',
                   fields='ts_code,symbol,name,area,industry,market,curr_type,list_date,is_hs')

    # create the entry point of the Flink program
    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # convert the pandas DataFrame into a Table and give it an alias via a temporary view
    table = table_env.from_pandas(df)
    table_env.create_temporary_view("stock_info", table)

    # declare the output sink
    sink_ddl = """
        -- register a MySQL table 'users' in Flink SQL
        create table Results(
            ts_code STRING,
            symbol STRING,
            name STRING,
            area STRING,
            industry STRING,
            market STRING,
            curr_type STRING,
            list_date STRING,
            is_hs STRING
        ) with (
            'connector' = 'jdbc',
            'url' = 'jdbc:mysql://localhost:3306/shares',
            'table-name' = 'dim_stock',
            'username' = 'root',
            'password' = '123456'
        )
    """
    table_env.execute_sql(sink_ddl)

    # the jdbc connector requires additional Java jars on the classpath
    table_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///home/wy/shares/mysql-connector-java-5.1.49.jar;"
        "file:///home/wy/shares/flink-connector-jdbc_2.12-1.12.2.jar")

    # when running in mini-cluster mode, call wait() so the job runs to completion
    table_env.execute_sql("insert into Results select * from stock_info").wait()
def main(args):
    func = args[1]
    version = args[2]
    index_name = '_'.join(["performance_pyflink", func, version])

    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 300000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    # t_env.register_table_sink(
    #     "sink",
    #     PrintTableSink(
    #         ["id"],
    #         [DataTypes.INT(False)]))

    @udf(input_types=[DataTypes.INT(False)], result_type=DataTypes.INT(False))
    def inc(x):
        return x + 1

    t_env.register_function("inc", inc)
    t_env.register_java_function("java_inc", "com.alibaba.flink.function.JavaInc")

    register_source(t_env)
    register_sink(t_env, index_name)

    source = t_env.from_path("source")

    if func == 'java':
        table = source.select("java_inc(a) as a")
    else:
        table = source.select("inc(a) as a")

    table.filter("a % 1000000 = 0") \
        .insert_into("sink")

    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink Python UDF inc() consume time: " + str(time.time() - beg_time))
def test_to_append_stream(self):
    self.env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
    table = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hi")], ["a", "b", "c"])
    new_table = table.select("a + 1, b + 'flink', c")
    ds = t_env.to_append_stream(
        table=new_table,
        type_info=Types.ROW([Types.LONG(), Types.STRING(), Types.STRING()]))
    test_sink = DataStreamTestSinkFunction()
    ds.add_sink(test_sink)
    self.env.execute("test_to_append_stream")
    result = test_sink.get_results(False)
    expected = ['2,Hiflink,Hello', '3,Helloflink,Hi']
    self.assertEqual(result, expected)
def main():
    table_env = StreamTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build())

    props = get_application_properties()
    input_property_map = property_map(props, "consumer.config.0")
    output_property_map = property_map(props, "sink.config.0")

    input_table_name = "input_table"
    output_table_name = "output_table"

    table_env.execute_sql("""CREATE TABLE {0} (
                foo BIGINT
            ) WITH (
                'connector' = 'kinesis',
                'stream' = '{1}',
                'aws.region' = '{2}',
                'scan.stream.initpos' = '{3}',
                'format' = 'json',
                'json.timestamp-format.standard' = 'ISO-8601'
            )""".format(input_table_name,
                        input_property_map["input.stream.name"],
                        input_property_map["aws.region"],
                        input_property_map["flink.stream.initpos"]))

    table_env.execute_sql("""CREATE TABLE {0} (
                bar BIGINT
            ) WITH (
                'connector' = 'filesystem',
                'path' = 's3a://{1}/',
                'format' = 'csv',
                'sink.partition-commit.policy.kind' = 'success-file',
                'sink.partition-commit.delay' = '1 min'
            )""".format(output_table_name,
                        output_property_map["output.bucket.name"]))

    table_env.execute_sql("""INSERT INTO {0} SELECT foo as bar FROM {1}
    """.format(output_table_name, input_table_name))
def test_to_retract_stream(self):
    self.env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.in_streaming_mode())
    table = t_env.from_elements([(1, "Hi", "Hello"), (1, "Hi", "Hello")], ["a", "b", "c"])
    new_table = table.group_by("c").select("a.sum, c as b")
    ds = t_env.to_retract_stream(
        table=new_table,
        type_info=Types.ROW([Types.LONG(), Types.STRING()]))
    test_sink = DataStreamTestSinkFunction()
    ds.map(lambda x: x).add_sink(test_sink)
    self.env.execute("test_to_retract_stream")
    result = test_sink.get_results(True)
    expected = [
        "(True, Row(f0=1, f1='Hello'))",
        "(False, Row(f0=1, f1='Hello'))",
        "(True, Row(f0=2, f1='Hello'))"
    ]
    self.assertEqual(result, expected)
def load(token):
    # fetch the trading-calendar dimension data via tushare
    pro = ts.pro_api(token)
    df = pro.trade_cal(start_date='20210310', end_date='20210410')

    # create the entry point of the Flink program
    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # convert the pandas DataFrame into a Table and give it an alias via a temporary view
    table = table_env.from_pandas(df)
    table_env.create_temporary_view("tai_date", table)

    # declare the output sink
    sink_ddl = """
        -- register a MySQL table 'users' in Flink SQL
        create table Results(
            exchange STRING,
            cal_date STRING,
            is_open bigint
        ) with (
            'connector' = 'jdbc',
            'url' = 'jdbc:mysql://localhost:3306/shares',
            'table-name' = 'dim_date',
            'username' = 'root',
            'password' = '123456'
        )
    """
    table_env.execute_sql(sink_ddl)

    # the jdbc connector requires additional Java jars on the classpath
    table_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///home/wy/shares/mysql-connector-java-5.1.49.jar;"
        "file:///home/wy/shares/flink-connector-jdbc_2.12-1.12.2.jar")

    # when running in mini-cluster mode, call wait() so the job runs to completion
    table_env.execute_sql("insert into Results select * from tai_date").wait()
def log_processing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=env_settings)

    # specify connector and format jars
    t_env.get_config().get_configuration().set_string("pipeline.jars", "file://" + FAT_JAR_PATH)

    source_ddl = """
            CREATE TABLE source_table(
                a VARCHAR,
                b INT
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'source_topic',
              'properties.bootstrap.servers' = 'localhost:9092',
              'properties.group.id' = 'test_group',
              'scan.startup.mode' = 'earliest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                a VARCHAR
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'sink_topic',
              'properties.bootstrap.servers' = 'localhost:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    t_env.from_path("source_table").select("a").execute_insert("sink_table").wait()
def log_processing():
    env_settings = EnvironmentSettings.new_instance().use_blink_planner().in_streaming_mode().build()
    t_env = TableEnvironment.create(env_settings)

    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///my/jar/path/connector.jar;file:///my/jar/path/json.jar")

    source_ddl = """
            CREATE TABLE source_table(
                a VARCHAR,
                b INT
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'topic',
              'properties.bootstrap.servers' = 'localhost:9092',
              'properties.group.id' = 'testGroup',
              'scan.startup.mode' = 'latest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                a VARCHAR
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'topic2',
              'properties.bootstrap.servers' = 'kafka:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    t_env.sql_query("SELECT a FROM source_table") \
        .execute_insert("sink_table").wait()
def process_json_data_with_udf():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .build())

    # update json columns
    @udf(result_type=DataTypes.STRING())
    def update_tel(data):
        json_data = json.loads(data)
        json_data['tel'] += 1
        return json.dumps(json_data)

    table = table.select(table.id, update_tel(table.data))

    # execute
    table.execute_insert('sink') \
        .wait()
def test_explain_with_multi_sinks_with_blink_planner(self):
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build())
    source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")], ["a", "b", "c"])
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink(
        "sink1",
        CsvTableSink(field_names, field_types, "path1"))
    t_env.register_table_sink(
        "sink2",
        CsvTableSink(field_names, field_types, "path2"))

    t_env.sql_update("insert into sink1 select * from %s where a > 100" % source)
    t_env.sql_update("insert into sink2 select * from %s where a < 100" % source)

    actual = t_env.explain(extended=True)

    self.assertIsInstance(actual, (str, unicode))
def setup_pyflink_env(gateway: JavaGateway, j_benv: JavaObject, j_senv: JavaObject) \
        -> Tuple[ExecutionEnvironment, BatchTableEnvironment,
                 StreamExecutionEnvironment, StreamTableEnvironment]:
    """
    Set up Py4J for PyFlink and create the Python environment instances.
    """
    from pyflink.dataset import ExecutionEnvironment
    from pyflink.datastream import StreamExecutionEnvironment
    from pyflink.table import BatchTableEnvironment, StreamTableEnvironment
    import pyflink

    # noinspection PyUnresolvedReferences
    pyflink.java_gateway._gateway = gateway
    # noinspection PyUnresolvedReferences
    pyflink.java_gateway.import_flink_view(gateway)

    benv = ExecutionEnvironment(j_benv)
    senv = StreamExecutionEnvironment(j_senv)
    # noinspection PyDeprecation
    btenv = BatchTableEnvironment.create(benv)
    # noinspection PyDeprecation
    stenv_settings = EnvironmentSettings.new_instance().use_old_planner().in_streaming_mode().build()
    stenv = StreamTableEnvironment.create(senv, environment_settings=stenv_settings)
    return benv, btenv, senv, stenv
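# A minimal usage sketch for setup_pyflink_env (not part of the original snippet), assuming the
# caller already holds a Py4J gateway; the way j_benv/j_senv are obtained below is only
# illustrative and may differ in the real calling code:

from pyflink.java_gateway import get_gateway

gateway = get_gateway()
j_benv = gateway.jvm.org.apache.flink.api.java.ExecutionEnvironment.getExecutionEnvironment()
j_senv = gateway.jvm.org.apache.flink.streaming.api.environment.StreamExecutionEnvironment \
    .getExecutionEnvironment()

# returns the batch/stream execution environments and their table environments
benv, btenv, senv, stenv = setup_pyflink_env(gateway, j_benv, j_senv)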
def basic_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    right_table = t_env.from_elements(elements=[(1, 18), (2, 30), (3, 25), (4, 10)],
                                      schema=['id', 'age'])

    table = table.add_columns(
        col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
        col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
        col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
        .drop_columns(col('data'))
    table.execute().print()
    # | op | id |    name | tel | country |
    # | +I |  1 |   Flink | 123 | Germany |
    # | +I |  2 |   hello | 135 |   China |
    # | +I |  3 |   world | 124 |     USA |
    # | +I |  4 | PyFlink |  32 |   China |

    # limit the number of outputs
    table.limit(3).execute().print()
    # | op | id |    name | tel | country |
    # | +I |  1 |   Flink | 123 | Germany |
    # | +I |  2 |   hello | 135 |   China |
    # | +I |  3 |   world | 124 |     USA |

    # filter
    table.filter(col('id') != 3).execute().print()
    # | op | id |    name | tel | country |
    # | +I |  1 |   Flink | 123 | Germany |
    # | +I |  2 |   hello | 135 |   China |
    # | +I |  4 | PyFlink |  32 |   China |

    # aggregation
    table.group_by(col('country')) \
        .select(col('country'), col('id').count, col('tel').cast(DataTypes.BIGINT()).max) \
        .execute().print()
    # | op | country | EXPR$0 | EXPR$1 |
    # | +I | Germany |      1 |    123 |
    # | +I |     USA |      1 |    124 |
    # | +I |   China |      1 |    135 |
    # | -U |   China |      1 |    135 |
    # | +U |   China |      2 |    135 |

    # distinct
    table.select(col('country')).distinct() \
        .execute().print()
    # | op | country |
    # | +I | Germany |
    # | +I |   China |
    # | +I |     USA |

    # join
    # Note that it still doesn't support duplicate column names between the joined tables
    table.join(right_table.rename_columns(col('id').alias('r_id')),
               col('id') == col('r_id')) \
        .execute().print()
    # | op | id |    name | tel | country | r_id | age |
    # | +I |  4 | PyFlink |  32 |   China |    4 |  10 |
    # | +I |  1 |   Flink | 123 | Germany |    1 |  18 |
    # | +I |  2 |   hello | 135 |   China |    2 |  30 |
    # | +I |  3 |   world | 124 |     USA |    3 |  25 |

    # join lateral
    @udtf(result_types=[DataTypes.STRING()])
    def split(r: Row):
        for s in r.name.split("i"):
            yield s

    table.join_lateral(split.alias('a')) \
        .execute().print()
    # | op | id |    name | tel | country |     a |
    # | +I |  1 |   Flink | 123 | Germany |    Fl |
    # | +I |  1 |   Flink | 123 | Germany |    nk |
    # | +I |  2 |   hello | 135 |   China | hello |
    # | +I |  3 |   world | 124 |     USA | world |
    # | +I |  4 | PyFlink |  32 |   China |  PyFl |
    # | +I |  4 | PyFlink |  32 |   China |    nk |

    # show schema
    table.print_schema()
    # (
    #   `id` BIGINT,
    #   `name` STRING,
    #   `tel` STRING,
    #   `country` STRING
    # )

    # show execute plan
    print(table.join_lateral(split.alias('a')).explain())
def log_processing():
    env_settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(environment_settings=env_settings)

    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///Users/liuhongwei/.m2/repository/org/apache/flink/flink-connector-kafka_2.11/1.12.0/flink-connector-kafka_2.11-1.12.0.jar;"
        "file:///Users/liuhongwei/.m2/repository/net/sf/json-lib/json-lib/2.3/json-lib-2.3-jdk15.jar;"
        "file:///Users/liuhongwei/.m2/repository/org/apache/kafka/kafka-clients/2.4.1/kafka-clients-2.4.1.jar")

    source_ddl = """
            CREATE TABLE source_table(
                token VARCHAR,
                stime BIGINT,
                appKey VARCHAR,
                user_action_time AS PROCTIME()
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'markTopic',
              'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
              'properties.group.id' = 'test_3',
              'scan.startup.mode' = 'earliest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                token VARCHAR,
                appKey VARCHAR,
                stime TIMESTAMP(3) NOT NULL,
                nums BIGINT NOT NULL
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'markTopic1',
              'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    query_sql = """
        SELECT
          token,
          appKey,
          TUMBLE_START(user_action_time, INTERVAL '5' MINUTE) as stime,
          COUNT(token) as nums
        FROM source_table
        WHERE appKey = 'YSHAppAndroidIOSH5'
        GROUP BY token, appKey, TUMBLE(user_action_time, INTERVAL '5' MINUTE)
    """
    # t_env.sql_query(query_sql) \
    #     .execute_insert("sink_table").wait()

    source_t = t_env.from_path("source_table")
    result = source_t.filter(source_t.appKey == "YSHAppAndroidIOSH5") \
        .window(Slide.over(lit(1).days)
                     .every(lit(1).minutes)
                     .on(source_t.user_action_time).alias("w")) \
        .group_by(source_t.token, source_t.appKey, col("w")) \
        .select(source_t.token,
                source_t.appKey,
                col("w").start.alias("stime"),
                source_t.token.count.alias("nums"))

    result.execute_insert("sink_table").wait()
def setUp(self):
    super(PyFlinkBlinkBatchTableTestCase, self).setUp()
    self.t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build())
    self.t_env._j_tenv.getPlanner().getExecEnv().setParallelism(2)
def kafka_to_mysql():
    """
    Read JSON data from a Kafka source and write it to MySQL.
    Sample record: {"msg": "welcome flink users..."}
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    # JARS_DIR=/Users/jincheng.sunjc/work/PlaygroundEnv/myJars/
    # wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-json/1.11.1/flink-json-1.11.1.jar; \
    # wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka-0.11_2.11/1.11.1/flink-sql-connector-kafka-0.11_2.11-1.11.1.jar; \
    # wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc_2.11/1.11.1/flink-connector-jdbc_2.11-1.11.1.jar; \
    # wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/mysql/mysql-connector-java/5.1.40/mysql-connector-java-5.1.40.jar; \
    # wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-table-common/1.11.1/flink-table-common-1.11.1.jar

    # add the connector and format dependencies
    base_dir = "file:///Users/jincheng.sunjc/work/PlaygroundEnv/myJars/"
    kafka_jar = f"{base_dir}flink-sql-connector-kafka-0.11_2.11-1.11.1.jar"
    jdbc_jar = f"{base_dir}flink-connector-jdbc_2.11-1.11.1.jar"
    mysql_jar = f"{base_dir}mysql-connector-java-5.1.40.jar"
    json_format_jar = f"{base_dir}flink-json-1.11.1.jar"
    table_common_jar = f"{base_dir}flink-table-common-1.11.1.jar"

    jar_seq = [kafka_jar, jdbc_jar, mysql_jar, json_format_jar, table_common_jar]
    jars = ";".join(jar_seq)
    t_env.get_config().get_configuration().set_string("pipeline.jars", jars)

    source_ddl = """
            CREATE TABLE kafka_source (
                msg STRING
            ) WITH (
                'connector' = 'kafka-0.11',
                'topic' = 'cdn-log',
                'properties.bootstrap.servers' = 'localhost:9092',
                'format' = 'json',
                'scan.startup.mode' = 'latest-offset'
            )
            """

    sink_ddl = """
            CREATE TABLE mysql_sink (
                msg STRING
            ) WITH (
               'connector' = 'jdbc',
               'url' = 'jdbc:mysql://localhost:3306/flinkdb?characterEncoding=utf-8&useSSL=false',
               'table-name' = 'cdn_log',
               'username' = 'root',
               'password' = '123456',
               'sink.buffer-flush.max-rows' = '1'
            )
            """

    # register the source and sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # extract the data
    tab = t_env.from_path("kafka_source")

    # for now we still use the API marked as deprecated, since the new async submission needs further testing...
    tab.insert_into("mysql_sink")

    # run the job
    t_env.execute("kafka_to_mysql")
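# For reference (not part of the original snippet): the non-deprecated submission path used by
# other snippets in this collection replaces the insert_into(...) + t_env.execute(...) pair with
# execute_insert(...).wait(). A minimal self-contained sketch against a print sink:

from pyflink.table import EnvironmentSettings, TableEnvironment

t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
t_env.execute_sql("CREATE TABLE print_sink (msg STRING) WITH ('connector' = 'print')")

# execute_insert submits the job; wait() blocks until it finishes (useful in mini-cluster mode)
t_env.from_elements([("welcome flink users...",)], ["msg"]) \
    .execute_insert("print_sink") \
    .wait()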
def group_by_window_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/table_group_by_window_agg_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("source")
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b"],
                     [DataTypes.STRING(), DataTypes.STRING()],
                     result_file))
    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")
    st_env.execute("group by window agg streaming")
def word_count():
    environment_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(environment_settings=environment_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    # set the Python executable explicitly in case `python` does not point to Python 3
    t_env.get_config().set_python_executable("python3")

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(), result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()), DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)

    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    data = [
        ("iPhone 11,30,5499,Beijing", ),
        ("iPhone 11 Pro,20,8699,Guangzhou", ),
        ("MacBook Pro,10,9999,Beijing", ),
        ("AirPods Pro,50,1999,Beijing", ),
        ("MacBook Pro,10,11499,Shanghai", ),
        ("iPhone 11,30,5999,Shanghai", ),
        ("iPhone 11 Pro,20,9999,Shenzhen", ),
        ("MacBook Pro,10,13899,Hangzhou", ),
        ("iPhone 11,10,6799,Beijing", ),
        ("MacBook Pro,10,18999,Beijing", ),
        ("iPhone 11 Pro,10,11799,Shenzhen", ),
        ("MacBook Pro,10,22199,Shanghai", ),
        ("AirPods Pro,40,1999,Shanghai", )
    ]

    t_env.from_elements(data, ["line"]) \
        .select("split(line) as str_array") \
        .select("get(str_array, 3) as city, "
                "get(str_array, 1).cast(LONG) as count, "
                "get(str_array, 2).cast(LONG) as unit_price") \
        .select("city, count, count * unit_price as total_price") \
        .group_by("city") \
        .select("city, "
                "sum(count) as sales_volume, "
                "sum(total_price) as sales") \
        .insert_into("Results")

    t_env.execute("word_count")
def test():
    # 1. create a TableEnvironment
    # env_settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    # table_env = StreamTableEnvironment.create(environment_settings=env_settings)
    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # 2. create source Table
    table_env.execute_sql("""
        CREATE TABLE source_table (
            Region VARCHAR,
            Country VARCHAR,
            Item_Type VARCHAR,
            Sales_Channel VARCHAR,
            Order_Priority VARCHAR,
            Order_Date VARCHAR,
            Order_ID VARCHAR,
            Ship_Date VARCHAR,
            Units_Sold VARCHAR,
            Unit_Price VARCHAR,
            Unit_Cost VARCHAR,
            Total_Revenue VARCHAR,
            Total_Cost VARCHAR,
            Total_Profit VARCHAR
        ) WITH (
            'connector' = 'filesystem',
            'path' = '/tmp/data/5m_Sales_Records.csv',
            'format' = 'csv'
        )
    """)

    table_env.execute_sql("""
        CREATE TABLE sink_table (
            Region VARCHAR,
            Country VARCHAR,
            Item_Type VARCHAR,
            Sales_Channel VARCHAR,
            Order_Priority VARCHAR,
            Order_Date VARCHAR,
            Order_ID VARCHAR,
            Ship_Date VARCHAR,
            Units_Sold VARCHAR,
            Unit_Price VARCHAR,
            Unit_Cost VARCHAR,
            Total_Revenue VARCHAR,
            Total_Cost VARCHAR,
            Total_Profit VARCHAR
        ) WITH (
            'connector' = 'filesystem',
            'path' = '/tmp/data/xxx_Sales_Records.csv',
            'format' = 'csv'
        )
    """)

    @udf(input_types=DataTypes.STRING(), result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()), DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    table_env.register_function("split", split)
    table_env.register_function("get", get)

    table_env.sql_query("SELECT * FROM source_table order by Region") \
        .execute_insert("sink_table").wait()
def pv_uv_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user_behavior")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    user_id: {"
                "      type: 'string'"
                "    },"
                "    item_id: {"
                "      type: 'string'"
                "    },"
                "    category_id: {"
                "      type: 'string'"
                "    },"
                "    behavior: {"
                "      type: 'string'"
                "    },"
                "    ts: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("user_id", DataTypes.STRING())
            .field("item_id", DataTypes.STRING())
            .field("category_id", DataTypes.STRING())
            .field("behavior", DataTypes.STRING())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("ts")
                .watermarks_periodic_bounded(60000))
        ) \
        .in_append_mode() \
        .register_table_source("source")

    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \
        .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \
        .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \
        .property("connector.table", "pv_uv_table") \
        .property("connector.write.flush.max-rows", "1")
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("startTime", DataTypes.TIMESTAMP())
            .field("endTime", DataTypes.TIMESTAMP())
            .field("pv", DataTypes.BIGINT())
            .field("uv", DataTypes.BIGINT())
        ).register_table_sink("sink")

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("w.start as startTime, w.end as endTime, COUNT(1) as pv, user_id.count.distinct as uv") \
        .insert_into("sink")
    st_env.execute("table pv uv")
def log_processing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=env_settings)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    source_ddl = """
            CREATE TABLE payment_msg(
                createTime VARCHAR,
                rt as TO_TIMESTAMP(createTime),
                orderId BIGINT,
                payAmount DOUBLE,
                payPlatform INT,
                provinceId INT,
                WATERMARK FOR rt as rt - INTERVAL '2' SECOND
            ) WITH (
              'connector' = 'kafka-0.11',
              'topic' = 'payment_msg',
              'properties.bootstrap.servers' = 'kafka:9092',
              'scan.startup.mode' = 'latest-offset',
              'format' = 'json'
            )
            """

    es_sink_ddl = """
            CREATE TABLE es_sink (
                province VARCHAR,
                pay_amount DOUBLE,
                rowtime TIMESTAMP(3)
            ) with (
                'connector.type' = 'elasticsearch',
                'connector.version' = '7',
                'connector.hosts' = 'http://elasticsearch:9200',
                'connector.index' = 'platform_pay_amount_1',
                'connector.document-type' = 'payment',
                'update-mode' = 'append',
                'connector.flush-on-checkpoint' = 'true',
                'connector.key-delimiter' = '$',
                'connector.key-null-literal' = 'n/a',
                'connector.bulk-flush.max-size' = '42mb',
                'connector.bulk-flush.max-actions' = '32',
                'connector.bulk-flush.interval' = '1000',
                'connector.bulk-flush.backoff.delay' = '1000',
                'format.type' = 'json'
            )
            """

    t_env.sql_update(source_ddl)
    t_env.sql_update(es_sink_ddl)
    t_env.register_function('province_id_to_name', province_id_to_name)

    query = """
        select province_id_to_name(provinceId) as province,
               sum(payAmount) as pay_amount,
               tumble_start(rt, interval '5' seconds) as rowtime
        from payment_msg
        group by tumble(rt, interval '5' seconds), provinceId
    """

    t_env.sql_query(query).insert_into("es_sink")

    t_env.execute("payment_demo")
def setUp(self):
    super(StreamDependencyTests, self).setUp()
    self.st_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    self.st_env._execution_mode = "loopback"
def distinct_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("Orders")
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("distinct_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
    ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
            Json()
            .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")
    orders = st_env.scan("Orders")
    result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \
        .group_by("a, w").select("a, b.max.distinct as d")
    result.insert_into("result")
    st_env.execute("distinct agg streaming")
from pyflink.table import EnvironmentSettings, StreamTableEnvironment

# 1. create a TableEnvironment
env_settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
table_env = StreamTableEnvironment.create(environment_settings=env_settings)

# 2. create source Table
table_env.execute_sql("""
    CREATE TABLE tweets(
        id INT,
        text STRING,
        user_screen_name STRING,
        followers_count INT,
        retweet_count INT,
        favorite_count INT
    ) WITH (
        'connector' = 'kafka',
        'topic' = 'tweets_data',
        'properties.bootstrap.servers' = 'localhost:2181'
    )
""")

# 3. create sink Table
table_env.execute_sql("""
    CREATE TABLE print (
        id INT,
        text STRING,
        user_screen_name STRING,
        followers_count INT,
        retweet_count INT
def column_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # add columns
    table = table.add_columns(
        col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
        col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
        col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country'))
    table.execute().print()
    # | op | id |                           data |    name | tel | country |
    # | +I |  1 | {"name": "Flink", "tel": 12... |   Flink | 123 | Germany |
    # | +I |  2 | {"name": "hello", "tel": 13... |   hello | 135 |   China |
    # | +I |  3 | {"name": "world", "tel": 12... |   world | 124 |     USA |
    # | +I |  4 | {"name": "PyFlink", "tel": ... | PyFlink |  32 |   China |

    # drop columns
    table = table.drop_columns(col('data'))
    table.execute().print()
    # | op | id |    name | tel | country |
    # | +I |  1 |   Flink | 123 | Germany |
    # | +I |  2 |   hello | 135 |   China |
    # | +I |  3 |   world | 124 |     USA |
    # | +I |  4 | PyFlink |  32 |   China |

    # rename columns
    table = table.rename_columns(col('tel').alias('telephone'))
    table.execute().print()
    # | op | id |    name | telephone | country |
    # | +I |  1 |   Flink |       123 | Germany |
    # | +I |  2 |   hello |       135 |   China |
    # | +I |  3 |   world |       124 |     USA |
    # | +I |  4 | PyFlink |        32 |   China |

    # replace columns
    table = table.add_or_replace_columns(
        concat(col('id').cast(DataTypes.STRING()), '_', col('name')).alias('id'))
    table.execute().print()
content = (
    'line Licensed to the Apache Software Foundation ASF under one '
    'line or more contributor license agreements See the NOTICE file '
    'line distributed with this work for additional information '
    'line regarding copyright ownership The ASF licenses this file '
    'to you under the Apache License Version the '
    'License you may not use this file except in compliance '
    'with the License'
)

env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(
    env,
    environment_settings=EnvironmentSettings.new_instance()
        .use_blink_planner()
        .build(),
)

result_path = '/notebooks/output.csv'
print('Results directory:', result_path)

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv()
    .field_delimiter(',')
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).with_schema(
    Schema()
    .field('word', DataTypes.STRING())
def row_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "China", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # map operation
    @udf(result_type=DataTypes.ROW([DataTypes.FIELD("id", DataTypes.BIGINT()),
                                    DataTypes.FIELD("country", DataTypes.STRING())]))
    def extract_country(input_row: Row):
        data = json.loads(input_row.data)
        return Row(input_row.id, data['addr']['country'])

    table.map(extract_country) \
        .execute().print()
    # | op | id | country |
    # | +I |  1 | Germany |
    # | +I |  2 |   China |
    # | +I |  3 |   China |
    # | +I |  4 |   China |

    # flat_map operation
    @udtf(result_types=[DataTypes.BIGINT(), DataTypes.STRING()])
    def extract_city(input_row: Row):
        data = json.loads(input_row.data)
        yield input_row.id, data['addr']['city']

    table.flat_map(extract_city) \
        .execute().print()
    # | op | f0 |       f1 |
    # | +I |  1 |   Berlin |
    # | +I |  2 | Shanghai |
    # | +I |  3 |  NewYork |
    # | +I |  4 | Hangzhou |

    # aggregate operation
    class CountAndSumAggregateFunction(AggregateFunction):

        def get_value(self, accumulator):
            return Row(accumulator[0], accumulator[1])

        def create_accumulator(self):
            return Row(0, 0)

        def accumulate(self, accumulator, input_row):
            accumulator[0] += 1
            accumulator[1] += int(input_row.tel)

        def retract(self, accumulator, input_row):
            accumulator[0] -= 1
            accumulator[1] -= int(input_row.tel)

        def merge(self, accumulator, accumulators):
            for other_acc in accumulators:
                accumulator[0] += other_acc[0]
                accumulator[1] += other_acc[1]

        def get_accumulator_type(self):
            return DataTypes.ROW([DataTypes.FIELD("cnt", DataTypes.BIGINT()),
                                  DataTypes.FIELD("sum", DataTypes.BIGINT())])

        def get_result_type(self):
            return DataTypes.ROW([DataTypes.FIELD("cnt", DataTypes.BIGINT()),
                                  DataTypes.FIELD("sum", DataTypes.BIGINT())])

    count_sum = udaf(CountAndSumAggregateFunction())
    table.add_columns(
        col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
        col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
        col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
        .group_by(col('country')) \
        .aggregate(count_sum.alias("cnt", "sum")) \
        .select(col('country'), col('cnt'), col('sum')) \
        .execute().print()
    # | op | country | cnt | sum |
    # | +I |   China |   3 | 291 |
    # | +I | Germany |   1 | 123 |

    # flat_aggregate operation
    class Top2(TableAggregateFunction):

        def emit_value(self, accumulator):
            for v in accumulator:
                if v:
                    yield Row(v)

        def create_accumulator(self):
            return [None, None]

        def accumulate(self, accumulator, input_row):
            tel = int(input_row.tel)
            if accumulator[0] is None or tel > accumulator[0]:
                accumulator[1] = accumulator[0]
                accumulator[0] = tel
            elif accumulator[1] is None or tel > accumulator[1]:
                accumulator[1] = tel

        def get_accumulator_type(self):
            return DataTypes.ARRAY(DataTypes.BIGINT())

        def get_result_type(self):
            return DataTypes.ROW([DataTypes.FIELD("tel", DataTypes.BIGINT())])

    top2 = udtaf(Top2())
    table.add_columns(
        col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
        col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
        col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
        .group_by(col('country')) \
        .flat_aggregate(top2) \
        .select(col('country'), col('tel')) \
        .execute().print()