# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes
from pyflink.table.sinks import CsvTableSink
from pyflink.table.sources import CsvTableSource


def select_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "c"],
                     [DataTypes.STRING(), DataTypes.INT()],
                     result_file))
    orders = bt_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    bt_env.execute("select batch")
def test_execute(self):
    tmp_dir = tempfile.gettempdir()
    field_names = ['a', 'b', 'c']
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env = BatchTableEnvironment.create(self.env)
    t_env.register_table_sink(
        'Results',
        CsvTableSink(
            field_names, field_types,
            os.path.join(tmp_dir, '{}.csv'.format(round(time.time())))))
    t_env.insert_into(
        'Results',
        t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c']))
    execution_result = t_env.execute('test_batch_execute')
    self.assertIsNotNone(execution_result.get_job_id())
    self.assertIsNotNone(execution_result.get_net_runtime())
    self.assertEqual(len(execution_result.get_all_accumulator_results()), 0)
    self.assertIsNone(execution_result.get_accumulator_result('accumulator'))
    self.assertIsNotNone(str(execution_result))
# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes
from pyflink.table.sinks import CsvTableSink


def minus_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = "/tmp/table_minus_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements(
        [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"),
         (2, "lb", "lbb"), (1, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements(
        [(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"),
         (1, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file))
    result = left.minus(right)
    result.insert_into("result")
    bt_env.execute("minus batch")
# Assumed imports for this legacy PyFlink batch snippet.
import logging
import os
import shutil
import sys
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig


def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"
    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # used to test the pipeline.jars and pipeline.classpaths config options
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().get_configuration().set_string(config_key, config_value)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.",
                          e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)
    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.sql_update(sink_ddl)

    t_env.sql_update(
        "create temporary system function add_one as 'add_one.add_one' language python"
    )
    t_env.register_java_function("add_one_java",
                                 "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .select("word, add_one(count) as count, add_one_java(count) as count_java") \
        .group_by("word") \
        .select("word, count(count) as count, count(count_java) as count_java") \
        .insert_into("Results")

    t_env.execute("word_count")
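# The DDL above references a Python module add_one exposing a function
# add_one; that module is not shown in the source. A plausible minimal
# sketch of what it might contain (an assumption, not the original code):
from pyflink.table import DataTypes
from pyflink.table.udf import udf


@udf(input_types=[DataTypes.BIGINT()], result_type=DataTypes.BIGINT())
def add_one(i):
    # increments the count column by one
    return i + 1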
# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes
from pyflink.table.sinks import CsvTableSink


def inner_join():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = "/tmp/table_inner_join.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements(
        [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"),
         (2, "4b", "4bb"), (5, "5a", "5aa")],
        ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements(
        [(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"),
         (4, "4b", "4bb")],
        ["d", "e", "f"]).select("d, e, f")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file))
    result = left.join(right).where("a = d").select("a, b, e")
    result.insert_into("result")
    bt_env.execute("inner join")
    with open(result_file, 'r') as f:
        print(f.read())
# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes
from pyflink.table.sinks import CsvTableSink
from pyflink.table.sources import CsvTableSource


def group_by_agg():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_agg.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(), DataTypes.INT(),
                        DataTypes.INT(), DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b"],
                     [DataTypes.STRING(), DataTypes.INT()],
                     result_file))
    orders = bt_env.scan("Orders")
    result = orders.group_by("a").select("a, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg")
    with open(result_file, 'r') as f:
        print(f.read())
# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes
from pyflink.table.sinks import CsvTableSink


def scalar_func_python_sql():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_table = bt_env.from_elements([("a", 1), ("b", 2), ("c", 3)],
                                        ["a", "b"]).select("a, b")
    result_file = "/tmp/scalar_func_python_sql.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b"],
                     [DataTypes.STRING(), DataTypes.INT()],
                     result_file))

    # register the Java scalar function
    bt_env.register_java_function("hashCode", "com.pyflink.table.HashCode")

    # register the table for use in the SQL query
    bt_env.register_table("MyTable", source_table)

    result = bt_env.sql_query("SELECT a, hashCode(a) FROM MyTable")
    result.insert_into("result")
    bt_env.execute("scalar func python sql")
# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes
from pyflink.table.sinks import CsvTableSink
from pyflink.table.sources import CsvTableSource
from pyflink.table.window import Tumble


def group_by_window_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_window_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "start", "end", "rowtime", "d"], [
            DataTypes.STRING(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.INT()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("a, w") \
        .select("a, w.start, w.end, w.rowtime, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by window agg batch")
# Assumed imports for this snippet (blink planner batch mode).
import codecs
import os

import cloudpickle
import pandas as pd

from pyflink.table import BatchTableEnvironment, DataTypes, EnvironmentSettings
from pyflink.table.descriptors import FileSystem, OldCsv, Schema
from pyflink.table.udf import udf


def _local_execute_func(exec_func, write_func, pickle_func, python_path):
    table_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .use_blink_planner().in_batch_mode().build())
    table_env.get_config().get_configuration().set_string(
        'parallelism.default', '1')
    table_env.get_config().set_python_executable(python_path)
    # The UDF ignores its input and emits the pickled payload as a string.
    table_env.register_function(
        exec_func,
        udf(lambda _: pickle_func, DataTypes.BIGINT(), DataTypes.STRING()))
    table_env.connect(FileSystem().path(write_func)) \
        .with_format(OldCsv().field('func', DataTypes.STRING())) \
        .with_schema(Schema().field('func', DataTypes.STRING())) \
        .create_temporary_table(exec_func)
    table = table_env.from_elements([(1, 'Joblib')])
    table.select('{}(_1)'.format(exec_func)).insert_into(exec_func)
    table_env.execute(exec_func)
    # Decode the execution result from the table sink file.
    execute_result = cloudpickle.loads(
        codecs.decode(
            pd.DataFrame(pd.read_csv(write_func))[0:].columns[0].encode(),
            'base64'))
    # Remove the table sink file to clean up intermediate output.
    os.remove(write_func)
    return execute_result
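# A hedged usage sketch for _local_execute_func (the names and paths below
# are illustrative, not from the source): the caller cloudpickles the value
# it wants round-tripped through the Flink job and base64-encodes it, which
# is exactly the encoding the decode step above reverses.
def _local_execute_func_example():
    payload = codecs.encode(cloudpickle.dumps('hello'), 'base64').decode()
    result = _local_execute_func(
        exec_func='my_udf',                    # hypothetical UDF/sink name
        write_func='/tmp/my_udf_result.csv',   # hypothetical sink path
        pickle_func=payload,
        python_path='/usr/bin/python3')
    assert result == 'hello'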
def create_table_env(self):
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    statement_set = t_env.create_statement_set()
    return exec_env, t_env, statement_set
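# A minimal sketch of how the returned statement set is typically driven
# (the sink name is illustrative and must have been registered on t_env
# beforehand): inserts are buffered on the statement set and submitted
# together as one job.
def create_table_env_usage(self):
    exec_env, t_env, statement_set = self.create_table_env()
    src = t_env.from_elements([(1, 'a'), (2, 'b')], ['id', 'name'])
    statement_set.add_insert('MySink', src)   # hypothetical sink name
    statement_set.execute().wait()            # submits all buffered inserts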
# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes
from pyflink.table.sinks import CsvTableSink


def union():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = os.getcwd() + "/tmp/table_union_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements(
        [(1, "1b", "1bb"), (2, "2a", "2aa"), (3, None, "3aa"),
         (1, "1a", "1laa"), (1, "1b", "1bb")],
        ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements(
        [(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"),
         (4, "4b", "4bb")],
        ["a", "b", "c"]).select("a, b, c")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file))
    result = left.union(right)
    # result = left.union_all(right)
    result.insert_into("result")
    bt_env.execute("union")
    with open(result_file, 'r') as f:
        print(f.read())
def test_custom_env(self):
    import pyflink
    from pyflink.dataset import ExecutionEnvironment
    from pyflink.datastream import StreamExecutionEnvironment
    benv = ExecutionEnvironment.get_execution_environment()
    senv = StreamExecutionEnvironment.get_execution_environment()
    from pyflink.table import BatchTableEnvironment
    from pyflink.table import StreamTableEnvironment
    btenv = BatchTableEnvironment.create(benv)
    stenv = StreamTableEnvironment.create(senv)
    mlenv = useCustomEnv(pyflink.java_gateway.get_gateway(),
                         benv, btenv, senv, stenv)
    t = mlenv.btenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    source = TableSourceBatchOp(t)
    source.print()
    t = mlenv.stenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    source = TableSourceStreamOp(t)
    source.print()
    StreamOperator.execute()
    from pyalink.alink import env
    env._in_custom_env = False
    resetEnv()
# Assumed imports for this PyFlink batch snippet.
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig
from pyflink.table.expressions import lit


def demo02():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # note: the legacy filesystem/csv connector uses the 'connector.type',
    # 'format.type' and 'connector.path' property keys together
    my_source_ddl = """
    create table mySource (
        word VARCHAR
    ) with (
        'connector.type' = 'filesystem',
        'format.type' = 'csv',
        'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/input'
    )
    """
    my_sink_ddl = """
    create table mySink (
        word VARCHAR,
        `count` BIGINT
    ) with (
        'connector.type' = 'filesystem',
        'format.type' = 'csv',
        'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output'
    )
    """
    t_env.execute_sql(my_source_ddl)
    t_env.execute_sql(my_sink_ddl)

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
# Assumed imports for this legacy PyFlink batch snippet.
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes, TableConfig
from pyflink.table.descriptors import FileSystem, OldCsv, Schema
from pyflink.table.expressions import lit


def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    # fails if the output file already exists
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes
from pyflink.table.sinks import CsvTableSink


def table_func_python_sql_join_lateral_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_table = bt_env.from_elements(
        [("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")],
        ["a", "b"]).select("a, b")
    result_file = "/tmp/table_func_python_sql_join_lateral_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.STRING(), DataTypes.STRING(), DataTypes.INT()],
                     result_file))
    # register the Java table function
    bt_env.register_java_function("split", "com.pyflink.table.Split")
    # register the table for use in the SQL query
    bt_env.register_table("MyTable", source_table)
    result = bt_env.sql_query(
        "SELECT a, word, length FROM MyTable, LATERAL TABLE(split(a)) as T(word, length)"
    )
    result.insert_into("result")
    bt_env.execute("table func python sql join lateral api")
# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes
from pyflink.table.sinks import CsvTableSink


def scalar_func_python_table_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_table = bt_env.from_elements([("a", "aa"), ("b", "bb"), ("c", "cc")],
                                        ["a", "b"]).select("a, b")
    result_file = "/tmp/scalar_func_python_table_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.STRING(), DataTypes.INT(), DataTypes.INT()],
                     result_file))

    # register the Java scalar function
    bt_env.register_java_function("hashCode", "com.pyflink.table.HashCode")

    # use the Java scalar function in the Python Table API
    result = source_table.select("a, a.hashCode(), hashCode(a)")
    result.insert_into("result")
    bt_env.execute("scalar func python table api")
def test_blink_from_element(self):
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .use_blink_planner().in_batch_mode().build())
    field_names = ["a", "b", "c", "d", "e", "f", "g", "h", "i",
                   "j", "k", "l", "m", "n", "o", "p", "q", "r"]
    field_types = [
        DataTypes.BIGINT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING(),
        DataTypes.STRING(),
        DataTypes.DATE(),
        DataTypes.TIME(),
        DataTypes.TIMESTAMP(),
        DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(),
        DataTypes.INTERVAL(DataTypes.DAY(), DataTypes.SECOND()),
        DataTypes.ARRAY(DataTypes.DOUBLE()),
        DataTypes.ARRAY(DataTypes.DOUBLE(False)),
        DataTypes.ARRAY(DataTypes.STRING()),
        DataTypes.ARRAY(DataTypes.DATE()),
        DataTypes.DECIMAL(10, 0),
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.DOUBLE())
        ]),
        DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
        DataTypes.BYTES(),
        PythonOnlyUDT()
    ]
    schema = DataTypes.ROW(
        list(
            map(
                lambda field_name, field_type: DataTypes.FIELD(
                    field_name, field_type), field_names, field_types)))
    table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
    t_env.register_table_sink("Results", table_sink)
    t = t_env.from_elements(
        [(1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
          datetime.time(1, 0, 0), datetime.datetime(1970, 1, 2, 0, 0),
          datetime.datetime(1970, 1, 2, 0, 0),
          datetime.timedelta(days=1, microseconds=10),
          [1.0, None], array.array("d", [1.0, 2.0]), ["abc"],
          [datetime.date(1970, 1, 2)], Decimal(1), Row("a", "b")(1, 2.0),
          {"key": 1.0}, bytearray(b'ABCD'), PythonOnlyPoint(3.0, 4.0))],
        schema)
    t.insert_into("Results")
    t_env.execute("test")
    actual = source_sink_utils.results()
    expected = [
        '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
        '1970-01-02 00:00:00.0,86400000,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
        '1.000000000000000000,1,2.0,{key=1.0},[65, 66, 67, 68],[3.0, 4.0]'
    ]
    self.assert_equals(actual, expected)
def test_construct_with_batch_env(self):
    execution_environment = ExecutionEnvironment.get_execution_environment()
    batch_table_environment = BatchTableEnvironment.create(execution_environment)
    ml_environment = MLEnvironment(
        exe_env=execution_environment,
        batch_tab_env=batch_table_environment)
    self.assertEqual(
        ml_environment.get_execution_environment(),
        execution_environment)
    self.assertEqual(
        ml_environment.get_batch_table_environment(),
        batch_table_environment)
def create_env(self) -> (ExecutionEnvironment, TableEnvironment, StatementSet):
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    t_env.get_config().get_configuration().set_string(
        "taskmanager.memory.task.off-heap.size", '80m')
    statement_set = t_env.create_statement_set()
    return exec_env, t_env, statement_set
def get_batch_table_environment(self) -> BatchTableEnvironment:
    """
    Get the BatchTableEnvironment. If the BatchTableEnvironment has not
    been set, it initializes the BatchTableEnvironment with the default
    configuration.

    :return: the BatchTableEnvironment.
    """
    if self._batch_tab_env is None:
        self._batch_tab_env = BatchTableEnvironment.create(
            ExecutionEnvironment.get_execution_environment())
    return self._batch_tab_env
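# A small usage sketch of the lazy accessor above (assuming a no-argument
# MLEnvironment constructor covers the default-configuration case):
# repeated calls reuse the environment created on the first call.
def lazy_accessor_example():
    ml_env = MLEnvironment()
    t_env_a = ml_env.get_batch_table_environment()
    t_env_b = ml_env.get_batch_table_environment()
    assert t_env_a is t_env_b  # cached after the first initialization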
def create_table_env(self):
    exec_env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .in_batch_mode().use_blink_planner().build())
    t_env._j_tenv.getPlanner().getExecEnv().setParallelism(1)
    statement_set = t_env.create_statement_set()
    t_env.get_config().set_python_executable('/usr/bin/python3')
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)
    return exec_env, t_env, statement_set
# Assumed imports for this legacy PyFlink batch snippet.
import logging
import os
import shutil
import tempfile

import wikipedia

import add  # local helper module from the original example

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes, TableConfig
from pyflink.table.descriptors import FileSystem, OldCsv, Schema


def word_count():
    result = wikipedia.page("New York City")
    content = result.summary
    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)
    print(add.add(10, 5))
    print("Word Count")

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.",
                          e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    # alternative DDL-based sink definition:
    # sink_ddl = """
    #     create table Results(
    #         word VARCHAR,
    #         `count` BIGINT
    #     ) with (
    #         'connector.type' = 'filesystem',
    #         'format.type' = 'csv',
    #         'connector.path' = '{}'
    #     )
    #     """.format(result_path)
    # t_env.sql_update(sink_ddl)

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('Results')

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute("word_count")
# Assumed imports for this PyFlink batch snippet.
import logging
import os
import shutil
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig
from pyflink.table import expressions as expr


def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"
    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    # t_config.set_python_executable("/opt/python38/bin/python3")
    # alternatively, add `python.client.executable: /usr/bin/python3`
    # to conf/flink-conf.yaml
    t_env = BatchTableEnvironment.create(env, t_config)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.",
                          e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)
    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
        .select(table.word, expr.lit(1).count.alias('count')) \
        .execute_insert("Results").wait()
# Assumed imports for this legacy PyFlink batch snippet.
import logging
import os
import shutil
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes, TableConfig
from pyflink.table.descriptors import FileSystem, OldCsv, Schema


def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"
    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.",
                          e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    t_env.register_java_function("len", "org.apache.flink.udf.UDFLength")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, len(word), count(1) as count") \
        .insert_into("Results")

    t_env.execute("word_count")
# Assumed imports for this snippet (blink planner batch mode).
import tushare as ts

from pyflink.table import BatchTableEnvironment, EnvironmentSettings


def load(token):
    # fetch the stock-basic dimension data via the tushare API
    pro = ts.pro_api(token)
    df = pro.query(
        'stock_basic',
        list_status='L',
        fields='ts_code,symbol,name,area,industry,market,curr_type,list_date,is_hs')

    # create the entry point of the Flink program
    env_settings = EnvironmentSettings.new_instance().in_batch_mode() \
        .use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # convert the pandas DataFrame into a Table and give it an alias
    # by registering a temporary view
    table = table_env.from_pandas(df)
    table_env.create_temporary_view("stock_info", table)

    # declare the output sink
    sink_ddl = """
    -- register the MySQL table 'dim_stock' in Flink SQL
    create table Results(
        ts_code STRING,
        symbol STRING,
        name STRING,
        area STRING,
        industry STRING,
        market STRING,
        curr_type STRING,
        list_date STRING,
        is_hs STRING
    ) with (
        'connector' = 'jdbc',
        'url' = 'jdbc:mysql://localhost:3306/shares?useUnicode=yes&characterEncoding=UTF-8&useSSL=false',
        'table-name' = 'dim_stock',
        'username' = 'root',
        'password' = '123456'
    )
    """
    table_env.execute_sql(sink_ddl)

    # the JDBC connector needs extra Java jars on the pipeline classpath
    table_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///home/wy/shares/mysql-connector-java-5.1.49.jar;"
        "file:///home/wy/shares/flink-connector-jdbc_2.12-1.12.2.jar")

    # in mini-cluster mode, call wait() so the job finishes before returning
    table_env.execute_sql(
        "insert into Results select * from stock_info").wait()
# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes
from pyflink.table.sinks import CsvTableSink


def offset_and_fetch_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file_1 = "/tmp/table_offset_and_fetch_batch_1.csv"
    result_file_2 = "/tmp/table_offset_and_fetch_batch_2.csv"
    result_file_3 = "/tmp/table_offset_and_fetch_batch_3.csv"
    for result_file in (result_file_1, result_file_2, result_file_3):
        if os.path.exists(result_file):
            os.remove(result_file)
    bt_env.register_table_sink(
        "result1",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file_1))
    bt_env.register_table_sink(
        "result2",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file_2))
    bt_env.register_table_sink(
        "result3",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file_3))
    left = bt_env.from_elements(
        [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"),
         (2, "lb", "lbb"), (4, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")
    ordered_table = left.order_by("a.asc")
    ordered_table.fetch(5).insert_into("result1")
    ordered_table.offset(1).insert_into("result2")
    ordered_table.offset(1).fetch(2).insert_into("result3")
    bt_env.execute("offset and fetch batch")
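# Given the five input rows and the ascending sort on `a`, a sketch of what
# each sink should receive (rows listed by their `a` values; the two a = 2
# rows may appear in either order):
#
#   result1 (fetch 5):            1, 2, 2, 3, 4   (all five rows)
#   result2 (offset 1):           2, 2, 3, 4      (skips the first row)
#   result3 (offset 1, fetch 2):  2, 2            (skips one, keeps two)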
def test_get_execution_plan(self):
    tmp_dir = tempfile.gettempdir()
    source_path = os.path.join(tmp_dir, 'streaming.csv')
    tmp_csv = os.path.join(tmp_dir, 'streaming2.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env = BatchTableEnvironment.create(self.env)
    csv_source = CsvTableSource(source_path, field_names, field_types)
    t_env.register_table_source("Orders", csv_source)
    t_env.register_table_sink(
        "Results",
        CsvTableSink(field_names, field_types, tmp_csv))
    t_env.scan("Orders").insert_into("Results")
    plan = self.env.get_execution_plan()
    json.loads(plan)
# Assumed imports for this PyFlink batch snippet (blink planner).
import logging
import os
import shutil
import tempfile

from pyflink.table import BatchTableEnvironment, EnvironmentSettings
from pyflink.table import expressions as expr


def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"
    env_settings = EnvironmentSettings.new_instance().in_batch_mode() \
        .use_blink_planner().build()
    t_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.",
                          e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)
    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
        .select(table.word, expr.lit(1).count.alias('count')) \
        .insert_into("Results")
    t_env.execute("word_count")
# Assumed imports for this legacy PyFlink batch snippet.
import logging
import os
import shutil
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes, TableConfig
from pyflink.table.descriptors import FileSystem, OldCsv, Schema


def word_count():
    f1 = open("/home/mnm/flink-1.9.1/1", "r")
    f2 = open("/home/mnm/flink-1.9.1/2", "r")
    f3 = open("/home/mnm/flink-1.9.1/3", "r")
    f4 = open("/home/mnm/flink-1.9.1/4", "r")
    f5 = open("/home/mnm/flink-1.9.1/5", "r")
    content = f1.read() + f2.read() + f3.read() + f4.read() + f5.read()
    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.",
                          e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute("Python batch word count")
# Assumed imports for this legacy PyFlink batch snippet.
import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig


def word_count():
    # declare a table environment and set configurations
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register the Results table in the table environment
    output_file = os.path.abspath('.') + '/out.txt'
    if os.path.exists(output_file):
        try:
            if os.path.isfile(output_file):
                os.remove(output_file)
        except OSError as e:
            print("Error", e.filename, e.strerror)

    print("Results:", output_file)
    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(output_file)
    t_env.sql_update(sink_ddl)

    # create the source table from a single string,
    # perform some transformations, and write the results to table Results
    content = "Who's there? I think I hear them. Stand, ho! Who's there?"
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    # execute the Flink Python Table API job
    t_env.execute("word_count")