def table_func_python_sql_join_lateral_api():
    """Join a table with a Java table function through SQL ``LATERAL TABLE``.

    A three-row in-memory table is registered as ``MyTable``, the Java class
    ``com.pyflink.table.Split`` is registered under the name ``split``, and
    the rows returned by the SQL query are written to a CSV sink in ``/tmp``.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    rows = [("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")]
    my_table = table_env.from_elements(rows, ["a", "b"]).select("a, b")

    # Delete output left behind by an earlier run.
    csv_path = "/tmp/table_func_python_sql_join_lateral_api.csv"
    if os.path.exists(csv_path):
        os.remove(csv_path)

    sink = CsvTableSink(
        ["a", "b", "c"],
        [DataTypes.STRING(), DataTypes.STRING(), DataTypes.INT()],
        csv_path)
    table_env.register_table_sink("result", sink)
    table_env.register_java_function("split", "com.pyflink.table.Split")
    table_env.register_table("MyTable", my_table)

    query = "SELECT a, word, length FROM MyTable, LATERAL TABLE(split(a)) as T(word, length)"
    table_env.sql_query(query).insert_into("result")
    table_env.execute("table func python sql join lateral api")
def group_by_agg():
    """Group the ``Orders`` CSV source by ``a``, sum ``b``, and print the CSV.

    The source file is resolved relative to the current working directory and
    the result is written to ``/tmp/table_group_by_agg.csv``, whose content is
    printed once the job has run.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    orders_csv = os.getcwd() + "/../resources/table_orders.csv"
    out_path = "/tmp/table_group_by_agg.csv"
    # Delete output left behind by an earlier run.
    if os.path.exists(out_path):
        os.remove(out_path)

    orders_source = CsvTableSource(
        orders_csv,
        ["a", "b", "c", "rowtime"],
        [DataTypes.STRING(), DataTypes.INT(), DataTypes.INT(), DataTypes.TIMESTAMP()])
    table_env.register_table_source("Orders", orders_source)

    csv_sink = CsvTableSink(
        ["a", "b"],
        [DataTypes.STRING(), DataTypes.INT()],
        out_path)
    table_env.register_table_sink("result", csv_sink)

    summed = table_env.scan("Orders").group_by("a").select("a, b.sum as d")
    summed.insert_into("result")
    table_env.execute("group by agg")

    with open(out_path, 'r') as out_file:
        print(out_file.read())
def word_count():
    """Count words with a Python UDF and a Java UDF and write them to CSV.

    ``sys.argv[1]`` and ``sys.argv[2]`` are read as a configuration key and
    value and stored in the table environment's configuration. The result
    table is written under the system temporary directory in ``result``.
    """
    content = (
        "line Licensed to the Apache Software Foundation ASF under one "
        "line or more contributor license agreements See the NOTICE file "
        "line distributed with this work for additional information "
        "line regarding copyright ownership The ASF licenses this file "
        "to you under the Apache License Version the "
        "License you may not use this file except in compliance "
        "with the License"
    )

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # The key/value pair from the command line is used to test
    # pipeline.jars and pipeline.classpaths.
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    configuration = t_env.get_config().get_configuration()
    configuration.set_string(config_key, config_value)

    # Prepare the location of the Results table; removal is best effort and
    # a failure is only logged.
    result_path = tempfile.gettempdir() + '/result'
    if os.path.exists(result_path):
        try:
            remover = os.remove if os.path.isfile(result_path) else shutil.rmtree
            remover(result_path)
        except OSError as err:
            logging.error("Error removing directory: %s - %s.", err.filename, err.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = (
        " create table Results( word VARCHAR, `count` BIGINT, `count_java` BIGINT ) "
        "with ( 'connector.type' = 'filesystem', 'format.type' = 'csv', "
        "'connector.path' = '{}' ) "
    ).format(result_path)
    t_env.sql_update(sink_ddl)

    t_env.sql_update(
        "create temporary system function add_one as 'add_one.add_one' language python")
    t_env.register_java_function("add_one_java", "org.apache.flink.python.tests.util.AddOne")

    elements = [(token, 0) for token in content.split(" ")]
    words = t_env.from_elements(elements, ["word", "count"])
    incremented = words.select(
        "word, add_one(count) as count, add_one_java(count) as count_java")
    counted = incremented.group_by("word").select(
        "word, count(count) as count, count(count_java) as count_java")
    counted.insert_into("Results")

    t_env.execute("word_count")
def union():
    """Union two in-memory tables, write the result to CSV, and print it.

    The output file lives in a ``tmp`` directory below the current working
    directory (``os.getcwd() + "/tmp/table_union_batch.csv"``).
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    out_path = os.getcwd() + "/tmp/table_union_batch.csv"
    # Delete output left behind by an earlier run.
    if os.path.exists(out_path):
        os.remove(out_path)

    columns = ["a", "b", "c"]
    left_rows = [(1, "1b", "1bb"), (2, "2a", "2aa"), (3, None, "3aa"),
                 (1, "1a", "1laa"), (1, "1b", "1bb")]
    right_rows = [(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"),
                  (4, "4b", "4bb")]
    left = table_env.from_elements(left_rows, ["a", "b", "c"]).select("a, b, c")
    right = table_env.from_elements(right_rows, ["a", "b", "c"]).select("a, b, c")

    csv_sink = CsvTableSink(
        columns,
        [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
        out_path)
    table_env.register_table_sink("result", csv_sink)

    left.union(right).insert_into("result")
    table_env.execute("union")

    with open(out_path, 'r') as out_file:
        print(out_file.read())
def minus_batch():
    """Apply ``minus`` to two in-memory tables and write the result to CSV.

    The result goes to ``/tmp/table_minus_batch.csv``.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    out_path = "/tmp/table_minus_batch.csv"
    # Delete output left behind by an earlier run.
    if os.path.exists(out_path):
        os.remove(out_path)

    left_rows = [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"),
                 (2, "lb", "lbb"), (1, "ra", "raa")]
    right_rows = [(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"),
                  (1, "ra", "raa")]
    left = table_env.from_elements(left_rows, ["a", "b", "c"]).select("a, b, c")
    right = table_env.from_elements(right_rows, ["a", "b", "c"]).select("a, b, c")

    csv_sink = CsvTableSink(
        ["a", "b", "c"],
        [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
        out_path)
    table_env.register_table_sink("result", csv_sink)

    left.minus(right).insert_into("result")
    table_env.execute("minus batch")
def group_by_window_agg_batch():
    """Aggregate the ``Orders`` CSV source over a tumbling window on ``rowtime``.

    Rows are grouped by ``a`` and the window ``w`` (``Tumble.over("1.hours")``),
    and the window start, end, rowtime and the sum of ``b`` are written to
    ``/tmp/table_group_by_window_agg_batch.csv``.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    orders_csv = os.getcwd() + "/../resources/table_orders.csv"
    out_path = "/tmp/table_group_by_window_agg_batch.csv"
    # Delete output left behind by an earlier run.
    if os.path.exists(out_path):
        os.remove(out_path)

    orders_source = CsvTableSource(
        orders_csv,
        ["a", "b", "c", "rowtime"],
        [DataTypes.STRING(), DataTypes.INT(), DataTypes.INT(), DataTypes.TIMESTAMP()])
    table_env.register_table_source("Orders", orders_source)

    csv_sink = CsvTableSink(
        ["a", "start", "end", "rowtime", "d"],
        [DataTypes.STRING(), DataTypes.TIMESTAMP(), DataTypes.TIMESTAMP(),
         DataTypes.TIMESTAMP(), DataTypes.INT()],
        out_path)
    table_env.register_table_sink("result", csv_sink)

    orders = table_env.scan("Orders")
    hourly = orders.window(Tumble.over("1.hours").on("rowtime").alias("w"))
    aggregated = hourly.group_by("a, w").select(
        "a, w.start, w.end, w.rowtime, b.sum as d")
    aggregated.insert_into("result")
    table_env.execute("group by agg batch")
def scalar_func_python_table_api():
    """Call a registered Java scalar function from the Python Table API.

    ``com.pyflink.table.HashCode`` is registered as ``hashCode`` and used in
    both call forms (``a.hashCode()`` and ``hashCode(a)``); the result is
    written to ``/tmp/scalar_func_python_table_api.csv``.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    rows = [("a", "aa"), ("b", "bb"), ("c", "cc")]
    letters = table_env.from_elements(rows, ["a", "b"]).select("a, b")

    out_path = "/tmp/scalar_func_python_table_api.csv"
    # Delete output left behind by an earlier run.
    if os.path.exists(out_path):
        os.remove(out_path)

    csv_sink = CsvTableSink(
        ["a", "b", "c"],
        [DataTypes.STRING(), DataTypes.INT(), DataTypes.INT()],
        out_path)
    table_env.register_table_sink("result", csv_sink)

    # Make the Java scalar function available under the name "hashCode".
    table_env.register_java_function("hashCode", "com.pyflink.table.HashCode")

    # Use the Java scalar function from a Table API expression string.
    letters.select("a, a.hashCode(), hashCode(a)").insert_into("result")
    table_env.execute("scalar func python table api")
def inner_join():
    """Inner-join two in-memory tables on ``a = d`` and print the CSV result.

    The joined columns ``a, b, e`` are written to ``/tmp/table_inner_join.csv``
    and the file content is printed after the job has run.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    out_path = "/tmp/table_inner_join.csv"
    # Delete output left behind by an earlier run.
    if os.path.exists(out_path):
        os.remove(out_path)

    left_rows = [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"),
                 (2, "4b", "4bb"), (5, "5a", "5aa")]
    right_rows = [(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"),
                  (4, "4b", "4bb")]
    left = table_env.from_elements(left_rows, ["a", "b", "c"]).select("a, b, c")
    right = table_env.from_elements(right_rows, ["d", "e", "f"]).select("d, e, f")

    csv_sink = CsvTableSink(
        ["a", "b", "c"],
        [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
        out_path)
    table_env.register_table_sink("result", csv_sink)

    joined = left.join(right).where("a = d").select("a, b, e")
    joined.insert_into("result")
    table_env.execute("inner join")

    with open(out_path, 'r') as out_file:
        print(out_file.read())
def test_custom_env(self):
    """Exercise ``useCustomEnv`` with caller-created PyFlink environments.

    Batch and stream execution/table environments are created by hand and
    handed to ``useCustomEnv``. A small table is then printed through a batch
    source operator and a stream source operator.

    The custom-env flag is cleared and ``resetEnv()`` is called in a
    ``finally`` block, so a failure inside the test body does not leave the
    custom environment active for whatever runs next.
    """
    import pyflink
    from pyflink.dataset import ExecutionEnvironment
    from pyflink.datastream import StreamExecutionEnvironment
    benv = ExecutionEnvironment.get_execution_environment()
    senv = StreamExecutionEnvironment.get_execution_environment()

    from pyflink.table import BatchTableEnvironment
    from pyflink.table import StreamTableEnvironment
    btenv = BatchTableEnvironment.create(benv)
    stenv = StreamTableEnvironment.create(senv)

    try:
        mlenv = useCustomEnv(pyflink.java_gateway.get_gateway(),
                             benv, btenv, senv, stenv)

        t = mlenv.btenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        source = TableSourceBatchOp(t)
        source.print()

        t = mlenv.stenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        source = TableSourceStreamOp(t)
        source.print()
        StreamOperator.execute()
    finally:
        # Always undo the custom-env state, even when the body above raised.
        from pyalink.alink import env
        env._in_custom_env = False
        resetEnv()
def create_table_env(self):
    """Create a batch execution env, its table env, and a statement set.

    The execution environment is limited to a parallelism of 1.

    :return: tuple ``(exec_env, t_env, statement_set)``.
    """
    execution_env = ExecutionEnvironment.get_execution_environment()
    execution_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(execution_env, TableConfig())
    return execution_env, table_env, table_env.create_statement_set()
def demo02():
    """Word count over a CSV source, with source and sink declared by SQL DDL.

    ``mySource`` (one ``word`` column) and ``mySink`` (``word``, ``count``)
    are created as filesystem/CSV tables, then the rows of ``mySource`` are
    grouped by word and the counts are inserted into ``mySink``. The call
    blocks until the insert job has finished (``.wait()``).

    Both DDL statements use the legacy property keys throughout
    (``connector.type`` / ``connector.path`` / ``format.type``). The previous
    version mixed the new-style ``'connector'`` key with the legacy
    ``'connector.path'`` and ``'format.type'`` keys, which do not belong to
    the same option set.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    my_source_ddl = (
        " create table mySource ( word VARCHAR ) "
        "with ( 'connector.type' = 'filesystem', 'format.type' = 'csv', "
        "'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/input' ) "
    )
    my_sink_ddl = (
        " create table mySink ( word VARCHAR, `count` BIGINT ) "
        "with ( 'connector.type' = 'filesystem', 'format.type' = 'csv', "
        "'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output' ) "
    )

    t_env.execute_sql(my_source_ddl)
    t_env.execute_sql(my_sink_ddl)

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
def demo01():
    """Word count over a CSV source, using connector descriptors.

    ``mySource`` and ``mySink`` are declared through ``t_env.connect(...)``
    with ``OldCsv`` formats, then the rows of ``mySource`` are grouped by word
    and the counts are inserted into ``mySink``. The call blocks until the
    insert job has finished (``.wait()``).
    """
    execution_env = ExecutionEnvironment.get_execution_environment()
    execution_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(execution_env, TableConfig())

    # Source: a single string column named "word".
    source_descriptor = table_env.connect(
        FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input'))
    source_format = OldCsv().field('word', DataTypes.STRING())
    source_descriptor = source_descriptor.with_format(source_format)
    source_schema = Schema().field('word', DataTypes.STRING())
    source_descriptor.with_schema(source_schema).create_temporary_table('mySource')

    # Sink: tab-delimited (word, count).
    # Note from the original author: an error is raised if the file exists.
    sink_descriptor = table_env.connect(
        FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output'))
    sink_format = OldCsv()
    sink_format = sink_format.field_delimiter('\t')
    sink_format = sink_format.field('word', DataTypes.STRING())
    sink_format = sink_format.field('count', DataTypes.BIGINT())
    sink_descriptor = sink_descriptor.with_format(sink_format)
    sink_schema = Schema()
    sink_schema = sink_schema.field('word', DataTypes.STRING())
    sink_schema = sink_schema.field('count', DataTypes.BIGINT())
    sink_descriptor.with_schema(sink_schema).create_temporary_table('mySink')

    words = table_env.from_path('mySource')
    counted = words.group_by(words.word).select(words.word, lit(1).count)
    counted.execute_insert('mySink').wait()
def scalar_func_python_sql():
    """Call a registered Java scalar function from a SQL query.

    ``com.pyflink.table.HashCode`` is registered as ``hashCode``, the input
    table is registered as ``MyTable``, and the query result is written to
    ``/tmp/scalar_func_python_sql.csv``.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    rows = [("a", 1), ("b", 2), ("c", 3)]
    my_table = table_env.from_elements(rows, ["a", "b"]).select("a, b")

    out_path = "/tmp/scalar_func_python_sql.csv"
    # Delete output left behind by an earlier run.
    if os.path.exists(out_path):
        os.remove(out_path)

    csv_sink = CsvTableSink(
        ["a", "b"],
        [DataTypes.STRING(), DataTypes.INT()],
        out_path)
    table_env.register_table_sink("result", csv_sink)

    # Make the Java scalar function available under the name "hashCode".
    table_env.register_java_function("hashCode", "com.pyflink.table.HashCode")

    # The table must be registered by name before SQL can refer to it.
    table_env.register_table("MyTable", my_table)

    hashed = table_env.sql_query("SELECT a, hashCode(a) FROM MyTable")
    hashed.insert_into("result")
    table_env.execute("scalar func python sql")
def select_batch():
    """Select columns ``a, b`` from the ``Orders`` CSV source into a CSV sink.

    The result goes to ``/tmp/table_select_batch.csv``.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    orders_csv = os.getcwd() + "/../resources/table_orders.csv"
    out_path = "/tmp/table_select_batch.csv"
    # Delete output left behind by an earlier run.
    if os.path.exists(out_path):
        os.remove(out_path)

    orders_source = CsvTableSource(
        orders_csv,
        ["a", "b", "c", "rowtime"],
        [DataTypes.STRING(), DataTypes.INT(), DataTypes.INT(), DataTypes.TIMESTAMP()])
    table_env.register_table_source("Orders", orders_source)

    csv_sink = CsvTableSink(
        ["a", "c"],
        [DataTypes.STRING(), DataTypes.INT()],
        out_path)
    table_env.register_table_sink("result", csv_sink)

    table_env.scan("Orders").select("a, b").insert_into("result")
    table_env.execute("select batch")
def test_construct_with_batch_env(self):
    """An MLEnvironment built from batch envs must hand the same envs back."""
    exec_env = ExecutionEnvironment.get_execution_environment()
    table_env = BatchTableEnvironment.create(exec_env)

    ml_env = MLEnvironment(exe_env=exec_env, batch_tab_env=table_env)

    self.assertEqual(ml_env.get_execution_environment(), exec_env)
    self.assertEqual(ml_env.get_batch_table_environment(), table_env)
def get_execution_environment(self) -> ExecutionEnvironment:
    """
    Return the batch ExecutionEnvironment held by this object.

    If none has been set yet, one is obtained from
    ``ExecutionEnvironment.get_execution_environment()``, stored on the
    instance, and returned.

    :return: the batch ExecutionEnvironment.
    """
    current = self._exe_env
    if current is None:
        current = ExecutionEnvironment.get_execution_environment()
        self._exe_env = current
    return current
def create_env(
        self) -> (ExecutionEnvironment, TableEnvironment, StatementSet):
    """Create a batch execution env, its table env, and a statement set.

    The execution environment runs with a parallelism of 1 and the table
    configuration sets ``taskmanager.memory.task.off-heap.size`` to ``80m``.

    :return: tuple ``(exec_env, t_env, statement_set)``.
    """
    execution_env = ExecutionEnvironment.get_execution_environment()
    execution_env.set_parallelism(1)

    table_env = BatchTableEnvironment.create(execution_env, TableConfig())
    configuration = table_env.get_config().get_configuration()
    configuration.set_string("taskmanager.memory.task.off-heap.size", '80m')

    return execution_env, table_env, table_env.create_statement_set()
def get_batch_table_environment(self) -> BatchTableEnvironment:
    """
    Return the BatchTableEnvironment held by this object.

    If none has been set yet, one is created on top of the execution
    environment returned by ``self.get_execution_environment()``, so that
    the table environment and the execution environment exposed by this
    object belong together. Previously the table environment was built on
    a separately obtained ExecutionEnvironment.

    :return: the BatchTableEnvironment.
    """
    if self._batch_tab_env is None:
        # Reuse (or lazily create) this object's own execution environment.
        self._batch_tab_env = BatchTableEnvironment.create(
            self.get_execution_environment())
    return self._batch_tab_env
def create_table_env(self):
    """Create a blink-planner batch table env plus a statement set.

    The table environment is built from ``EnvironmentSettings`` (batch mode,
    blink planner). Parallelism is set to 1 on the planner's execution
    environment, the Python executable is set to ``/usr/bin/python3``, and
    ``python.fn-execution.memory.managed`` is enabled.

    :return: tuple ``(exec_env, t_env, statement_set)``; ``exec_env`` is
        obtained separately and is not passed to the table environment.
    """
    execution_env = ExecutionEnvironment.get_execution_environment()

    settings = EnvironmentSettings.new_instance(
    ).in_batch_mode().use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=settings)

    # Parallelism is set through the underlying Java planner object.
    table_env._j_tenv.getPlanner().getExecEnv().setParallelism(1)

    statements = table_env.create_statement_set()
    table_env.get_config().set_python_executable('/usr/bin/python3')
    table_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)
    return execution_env, table_env, statements
def test_equals_and_hash(self):
    """Configs compare and hash equal exactly when their settings match."""
    first = ExecutionEnvironment.get_execution_environment().get_config()
    second = ExecutionEnvironment.get_execution_environment().get_config()

    # Two untouched configs are equal.
    self.assertEqual(first, second)
    self.assertEqual(hash(first), hash(second))

    # Changing one side breaks equality ...
    first.set_parallelism(12)
    self.assertNotEqual(first, second)
    self.assertNotEqual(hash(first), hash(second))

    # ... and applying the same change to the other side restores it.
    second.set_parallelism(12)
    self.assertEqual(first, second)
    self.assertEqual(hash(first), hash(second))
def word_count():
    """Count the words of the Wikipedia summary for "New York City".

    The summary text is split on single spaces, grouped by word, and the
    counts are written as tab-delimited CSV to ``result`` under the system
    temporary directory.

    The sink writes to the same ``result_path`` that is cleaned up and
    logged. Previously the sink was hard-coded to ``/tmp/output``, so the
    cleanup removed a different location than the one the job wrote to.
    """
    result = wikipedia.page("New York City")
    content = result.summary

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    print(add.add(10, 5))
    print("Word Count")

    # Prepare the location of the Results table; removal is best effort and
    # a failure is only logged.
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    # Register the Results sink at the path that was just cleaned.
    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('Results')

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute("word_count")
def usePyFlinkEnv(parallelism: int = None, flinkHome: str = None) -> MLEnv:
    """Initialise the global MLEnv on top of the gateway launched by PyFlink.

    If a custom environment is already active (``in_custom_env()``), a
    warning is printed and the existing global MLEnv is returned unchanged.
    Otherwise the environment is reset, PyFlink's Java gateway is obtained,
    a callback server is started on it, and batch/stream execution
    environments are created and handed to ``setup_py_ml_env``.

    :param parallelism: if not None, applied to both the batch and the
        stream execution environment.
    :param flinkHome: if not None, stored in ``g_config["flink_home"]``.
    :return: the global MLEnv.
    """
    global _mlenv

    if in_custom_env():
        print(
            "Warning: usePyFlinkEnv will do nothing, since useCustomEnv is used to initialize MLEnv."
        )
        return _mlenv

    resetEnv()
    if flinkHome is not None:
        g_config["flink_home"] = flinkHome

    # PyFlink launches the gateway itself; users are warned that the Alink
    # jars have to be on the PyFlink lib path.
    print(
        "Warning: You're running the script with 'getMLEnv'. "
        "You have to manually add Alink jars to PyFlink lib path to make the script work."
    )
    import pyflink
    # noinspection PyUnresolvedReferences
    gateway = pyflink.java_gateway.get_gateway()
    # noinspection PyUnresolvedReferences
    pyflink.java_gateway.import_flink_view(gateway)

    # In PyFlink 1.9 and 1.10, PyFlink doesn't start the callback server,
    # so it is started here manually.
    callback_params = CallbackServerParameters(
        port=0, daemonize=True, daemonize_connections=True)
    callback_started = gateway.start_callback_server(
        callback_server_parameters=callback_params)
    if callback_started:
        # Tell the Java side which port the callback server listens on.
        listening_port = gateway.get_callback_server().get_listening_port()
        java_server = gateway.java_gateway_server
        java_server.resetCallbackClient(
            java_server.getCallbackClient().getAddress(),
            listening_port)

    set_java_gateway(gateway)

    from pyflink.dataset import ExecutionEnvironment
    from pyflink.datastream import StreamExecutionEnvironment

    batch_env = ExecutionEnvironment.get_execution_environment()
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    if parallelism is not None:
        batch_env.set_parallelism(parallelism)
        stream_env.set_parallelism(parallelism)

    # noinspection PyProtectedMember
    _mlenv = setup_py_ml_env(gateway,
                             batch_env._j_execution_environment,
                             stream_env._j_stream_execution_environment)
    return _mlenv
def test_create_table_environment(self):
    """Settings placed on a TableConfig must be readable from the table env."""
    config = TableConfig()
    config.set_max_generated_code_length(32000)
    config.set_null_check(False)
    config.set_local_timezone("Asia/Shanghai")

    exec_env = ExecutionEnvironment.get_execution_environment()
    table_env = BatchTableEnvironment.create(exec_env, config)

    config_read_back = table_env.get_config()

    self.assertFalse(config_read_back.get_null_check())
    self.assertEqual(config_read_back.get_max_generated_code_length(), 32000)
    self.assertEqual(config_read_back.get_local_timezone(), "Asia/Shanghai")
def word_count():
    """Count words and apply the Java ``len`` UDF, writing the result to CSV.

    The text is split on single spaces and grouped by word. For every word
    the registered Java function ``len`` (``org.apache.flink.udf.UDFLength``)
    and the count are written to ``result`` under the system temporary
    directory.
    """
    content = (
        "line Licensed to the Apache Software Foundation ASF under one "
        "line or more contributor license agreements See the NOTICE file "
        "line distributed with this work for additional information "
        "line regarding copyright ownership The ASF licenses this file "
        "to you under the Apache License Version the "
        "License you may not use this file except in compliance "
        "with the License"
    )

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # Prepare the location of the Results table; removal is best effort and
    # a failure is only logged.
    result_path = tempfile.gettempdir() + '/result'
    if os.path.exists(result_path):
        try:
            remover = os.remove if os.path.isfile(result_path) else shutil.rmtree
            remover(result_path)
        except OSError as err:
            logging.error("Error removing directory: %s - %s.", err.filename, err.strerror)

    logging.info("Results directory: %s", result_path)

    # Register the comma-delimited (word, len, count) sink.
    descriptor = t_env.connect(FileSystem().path(result_path))
    csv_format = OldCsv().field_delimiter(',')
    csv_format = csv_format.field("word", DataTypes.STRING())
    csv_format = csv_format.field("len", DataTypes.INT())
    csv_format = csv_format.field("count", DataTypes.BIGINT())
    descriptor = descriptor.with_format(csv_format)
    schema = Schema().field("word", DataTypes.STRING())
    schema = schema.field("len", DataTypes.INT())
    schema = schema.field("count", DataTypes.BIGINT())
    descriptor.with_schema(schema).register_table_sink("Results")

    t_env.register_java_function("len", "org.apache.flink.udf.UDFLength")

    elements = [(token, 1) for token in content.split(" ")]
    words = t_env.from_elements(elements, ["word", "count"])
    counted = words.group_by("word").select("word, len(word), count(1) as count")
    counted.insert_into("Results")

    t_env.execute("word_count")
def word_count():
    """Count words with the expression DSL and insert them into a DDL sink.

    The text is split on single spaces and grouped by word. The counts are
    inserted into the ``Results`` table, a filesystem/CSV table located at
    ``result`` under the system temporary directory. The call blocks until
    the insert job has finished (``.wait()``).
    """
    content = (
        "line Licensed to the Apache Software Foundation ASF under one "
        "line or more contributor license agreements See the NOTICE file "
        "line distributed with this work for additional information "
        "line regarding copyright ownership The ASF licenses this file "
        "to you under the Apache License Version the "
        "License you may not use this file except in compliance "
        "with the License"
    )

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    # Original author's note: instead of calling
    # t_config.set_python_executable("/opt/python38/bin/python3"), add
    # "python.client.executable: /usr/bin/python3" to flink-conf.yaml.
    t_env = BatchTableEnvironment.create(env, t_config)

    # Prepare the location of the Results table; removal is best effort and
    # a failure is only logged.
    result_path = tempfile.gettempdir() + '/result'
    if os.path.exists(result_path):
        try:
            remover = os.remove if os.path.isfile(result_path) else shutil.rmtree
            remover(result_path)
        except OSError as err:
            logging.error("Error removing directory: %s - %s.", err.filename, err.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = (
        " create table Results( word VARCHAR, `count` BIGINT ) "
        "with ( 'connector.type' = 'filesystem', 'format.type' = 'csv', "
        "'connector.path' = '{}' ) "
    ).format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(token, 1) for token in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    grouped = table.group_by(table.word)
    counted = grouped.select(table.word, expr.lit(1).count.alias('count'))
    counted.execute_insert("Results").wait()
def offset_and_fetch_batch():
    """Write three ``offset``/``fetch`` variants of an ordered table to CSV.

    The in-memory table is ordered by ``a`` ascending, then
    ``fetch(5)`` goes to ``result1``, ``offset(1)`` to ``result2`` and
    ``offset(1).fetch(2)`` to ``result3``; each sink is a CSV file in ``/tmp``.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    # (sink name, output file) pairs, in registration order.
    sinks = [
        ("result1", "/tmp/table_offset_and_fetch_batch_1.csv"),
        ("result2", "/tmp/table_offset_and_fetch_batch_2.csv"),
        ("result3", "/tmp/table_offset_and_fetch_batch_3.csv"),
    ]

    # Delete output left behind by an earlier run.
    for _, csv_path in sinks:
        if os.path.exists(csv_path):
            os.remove(csv_path)

    for sink_name, csv_path in sinks:
        table_env.register_table_sink(
            sink_name,
            CsvTableSink(["a", "b", "c"],
                         [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
                         csv_path))

    rows = [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"),
            (2, "lb", "lbb"), (4, "ra", "raa")]
    source = table_env.from_elements(rows, ["a", "b", "c"]).select("a, b, c")

    ordered = source.order_by("a.asc")
    ordered.fetch(5).insert_into("result1")
    ordered.offset(1).insert_into("result2")
    ordered.offset(1).fetch(2).insert_into("result3")

    table_env.execute("offset and fetch batch")
def word_count():
    """Count the words of five input files and write the counts to CSV.

    The files ``/home/mnm/flink-1.9.1/1`` .. ``5`` are read in order and
    concatenated, the text is split on single spaces, grouped by word, and
    the counts are written to ``result`` under the system temporary
    directory.

    Each input file is read inside a ``with`` block so its handle is closed
    as soon as it has been read; previously the five handles were never
    closed.

    :raises OSError: if one of the input files cannot be opened.
    """
    input_paths = [
        "/home/mnm/flink-1.9.1/1",
        "/home/mnm/flink-1.9.1/2",
        "/home/mnm/flink-1.9.1/3",
        "/home/mnm/flink-1.9.1/4",
        "/home/mnm/flink-1.9.1/5",
    ]
    parts = []
    for input_path in input_paths:
        with open(input_path, "r") as input_file:
            parts.append(input_file.read())
    content = "".join(parts)

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # Prepare the location of the Results table; removal is best effort and
    # a failure is only logged.
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    # Register the comma-delimited (word, count) sink.
    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute("Python batch word count")
def word_count():
    """Count the words of a short fixed sentence and write them to ``out.txt``.

    The output file is ``out.txt`` in the current directory
    (``os.path.abspath('.') + '/out.txt'``); an existing regular file at that
    path is removed first, and a removal error is only printed.
    """
    # Set up the batch table environment with a parallelism of 1.
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env, TableConfig())

    # Prepare the output location for the Results table.
    output_file = os.path.abspath('.') + '/out.txt'
    try:
        if os.path.exists(output_file) and os.path.isfile(output_file):
            os.remove(output_file)
    except OSError as err:
        print("Error", err.filename, err.strerror)

    print("Results:", output_file)

    sink_ddl = (
        " create table Results( word VARCHAR, `count` BIGINT ) "
        "with ( 'connector.type' = 'filesystem', 'format.type' = 'csv', "
        "'connector.path' = '{}' ) "
    ).format(output_file)
    table_env.sql_update(sink_ddl)

    # Build the source table from a single string, count per word, and
    # send the result to the Results table.
    content = "Who's there? I think I hear them. Stand, ho! Who's there?"
    elements = [(token, 1) for token in content.split(" ")]
    words = table_env.from_elements(elements, ["word", "count"])
    counted = words.group_by("word").select("word, count(1) as count")
    counted.insert_into("Results")

    # Run the Table API job.
    table_env.execute("word_count")
def aggregate_func_python_table_api():
    """Call a registered Java aggregate function from the Python Table API.

    ``com.pyflink.table.WeightedAvg`` is registered as ``wAvg`` and applied
    to ``(points, level)`` per ``user``; the result is written to
    ``/tmp/aggregate_func_python_table_api.csv``.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(exec_env)

    rows = [("a", 1, 1), ("a", 2, 2), ("b", 3, 2), ("a", 5, 2)]
    scores = table_env.from_elements(rows, ["user", "points", "level"])

    out_path = "/tmp/aggregate_func_python_table_api.csv"
    # Delete output left behind by an earlier run.
    if os.path.exists(out_path):
        os.remove(out_path)

    csv_sink = CsvTableSink(
        ["a", "b"],
        [DataTypes.STRING(), DataTypes.BIGINT()],
        out_path)
    table_env.register_table_sink("result", csv_sink)
    table_env.register_java_function("wAvg", "com.pyflink.table.WeightedAvg")

    averaged = scores.group_by("user").select(
        "user, wAvg(points, level) as avgPoints")
    averaged.insert_into("result")
    table_env.execute("aggregate func python table api")
class main():
    """Word-count tutorial job written directly in a class body.

    NOTE(review): the statements below are class-level statements, so the
    job is built and submitted when this ``class`` statement is executed,
    not when ``main`` is instantiated or called. The names assigned here
    (``exec_env``, ``t_config``, ``t_env``, ``my_source_ddl``) become class
    attributes.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # The source table is declared through SQL DDL: one "word" column read
    # as CSV from ./temp/input.
    my_source_ddl = (
        " create table mySource ( word VARCHAR ) "
        "with ( 'connector.type' = 'filesystem', 'format.type' = 'csv', "
        "'connector.path' = './temp/input' ) "
    )
    t_env.sql_update(my_source_ddl)

    # The sink is declared through a connector descriptor: tab-delimited
    # (word, count) written to /tmp/output.
    (
        t_env.connect(FileSystem().path('/tmp/output'))
        .with_format(
            OldCsv()
            .field_delimiter('\t')
            .field('word', DataTypes.STRING())
            .field('count', DataTypes.BIGINT()))
        .with_schema(
            Schema()
            .field('word', DataTypes.STRING())
            .field('count', DataTypes.BIGINT()))
        .create_temporary_table('mySink')
    )

    # Count the rows per word and send them to the sink.
    (
        t_env.from_path('mySource')
        .group_by('word')
        .select('word, count(1)')
        .insert_into('mySink')
    )

    t_env.execute("tutorial_job")