def create_table_env(self):
    """Build a batch execution environment, a table environment and a statement set.

    Returns:
        A ``(exec_env, t_env, statement_set)`` tuple: the execution
        environment (parallelism set to 1), the ``BatchTableEnvironment``
        created from it with a fresh ``TableConfig``, and a statement set
        created by that table environment.
    """
    batch_env = ExecutionEnvironment.get_execution_environment()
    batch_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(batch_env, TableConfig())
    return batch_env, table_env, table_env.create_statement_set()
def word_count():
    """Count words of a fixed text and write the counts to the ``Results`` table.

    ``sys.argv[1]`` and ``sys.argv[2]`` are read as a configuration key and
    value and set on the table environment's configuration. Each word is
    passed through the Python function ``add_one`` and the Java function
    ``add_one_java`` before being grouped and counted. Any existing file or
    directory at ``<tempdir>/result`` is removed first; a failure to remove
    it is logged, not raised.
    """
    text = (
        "line Licensed to the Apache Software Foundation ASF under one "
        "line or more contributor license agreements See the NOTICE file "
        "line distributed with this work for additional information "
        "line regarding copyright ownership The ASF licenses this file "
        "to you under the Apache License Version the "
        "License you may not use this file except in compliance "
        "with the License"
    )

    table_config = TableConfig()
    batch_env = ExecutionEnvironment.get_execution_environment()
    table_env = BatchTableEnvironment.create(batch_env, table_config)

    # used to test pipeline.jars and pipeline.classpaths
    key, value = sys.argv[1], sys.argv[2]
    table_env.get_config().get_configuration().set_string(key, value)

    # Clear any previous output before registering the Results table.
    output_path = '{}/result'.format(tempfile.gettempdir())
    if os.path.exists(output_path):
        try:
            remover = os.remove if os.path.isfile(output_path) else shutil.rmtree
            remover(output_path)
        except OSError as err:
            logging.error("Error removing directory: %s - %s.", err.filename, err.strerror)

    logging.info("Results directory: %s", output_path)

    ddl_template = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """
    table_env.sql_update(ddl_template.format(output_path))

    # Register the Python and Java scalar functions used by the query below.
    table_env.sql_update(
        "create temporary system function add_one as 'add_one.add_one' language python"
    )
    table_env.register_java_function("add_one_java", "org.apache.flink.python.tests.util.AddOne")

    rows = [(token, 0) for token in text.split(" ")]
    words = table_env.from_elements(rows, ["word", "count"])
    with_udfs = words.select("word, add_one(count) as count, add_one_java(count) as count_java")
    counted = with_udfs.group_by("word") \
        .select("word, count(count) as count, count(count_java) as count_java")
    counted.insert_into("Results")

    table_env.execute("word_count")
def __init__(self, parallelism: int = 1, checkpoint_interval: Optional[int] = None, state_ttl: Optional[int] = None) -> None:
    """Create the stream and table environments and run the table DDLs.

    Args:
        parallelism: Parallelism set on the stream execution environment.
        checkpoint_interval: When truthy, a RocksDB state backend (incremental
            checkpointing, rooted at ``self.checkpoints_path``) is installed
            and checkpointing is enabled with ``checkpoint_interval * 1000``
            (presumably seconds converted to milliseconds; confirm).
        state_ttl: When truthy, idle state retention is set to a minimum of
            ``state_ttl`` seconds and a maximum of ``state_ttl + 300`` seconds.

    The environments are stored on ``self.env`` and ``self.table_env`` only
    after every statement in ``self.tables`` has been executed.
    """
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    stream_env.set_parallelism(parallelism)

    if checkpoint_interval:
        backend = RocksDBStateBackend(self.checkpoints_path,
                                      enable_incremental_checkpointing=True)
        stream_env.set_state_backend(backend)
        stream_env.enable_checkpointing(checkpoint_interval * 1000)

    config = TableConfig()
    if state_ttl:
        min_retention = timedelta(seconds=state_ttl)
        max_retention = timedelta(seconds=state_ttl + 300)
        config.set_idle_state_retention_time(min_retention, max_retention)

    stream_table_env = StreamTableEnvironment.create(stream_env, table_config=config)
    flink_conf = stream_table_env.get_config().get_configuration()
    flink_conf.set_string("pipeline.jars", self.flink_sql_connector_kafka_jar)

    # Create every table declared on the instance.
    for statement in self.tables:
        stream_table_env.execute_sql(statement)

    self.env = stream_env
    self.table_env = stream_table_env
def setUp(self):
    """Create the batch environments used by each test.

    Sets ``self.env`` (parallelism 2) and ``self.t_env``, and configures
    ``taskmanager.memory.task.off-heap.size`` to ``80mb``.
    """
    super(PyFlinkBatchTableTestCase, self).setUp()

    batch_env = ExecutionEnvironment.get_execution_environment()
    self.env = batch_env
    batch_env.set_parallelism(2)

    self.t_env = BatchTableEnvironment.create(batch_env, TableConfig())
    flink_conf = self.t_env.get_config().get_configuration()
    flink_conf.set_string("taskmanager.memory.task.off-heap.size", "80mb")
def demo02():
    """Count the words of a CSV source table and write them to a CSV sink.

    Both tables are declared through SQL DDL on a ``BatchTableEnvironment``
    (parallelism 1). The function blocks until the insert job has finished.

    The DDLs use the legacy descriptor keys consistently
    (``connector.type`` / ``format.type`` / ``connector.path``). Mixing the
    new-style ``'connector'`` key with the legacy ``format.type`` and
    ``connector.path`` keys does not describe a valid table.
    """
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/input'
        )
    """

    my_sink_ddl = """
        create table mySink (
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output'
        )
    """

    t_env.execute_sql(my_source_ddl)
    t_env.execute_sql(my_sink_ddl)

    # Group by word and count one literal per row, then wait for the job.
    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
def setUp(self):
    """Create the batch environments used by each test.

    Sets ``self.env`` (parallelism 2) and ``self.t_env``, and configures
    ``python.fn-execution.bundle.size`` to ``1``.
    """
    super(PyFlinkOldBatchTableTestCase, self).setUp()

    batch_env = ExecutionEnvironment.get_execution_environment()
    self.env = batch_env
    batch_env.set_parallelism(2)

    self.t_env = BatchTableEnvironment.create(batch_env, TableConfig())
    flink_conf = self.t_env.get_config().get_configuration()
    flink_conf.set_string("python.fn-execution.bundle.size", "1")
def demo01():
    """Count the words of a CSV source and write them to a tab-delimited sink.

    Source and sink are registered as temporary tables through the
    descriptor API on a ``BatchTableEnvironment`` (parallelism 1). The
    function blocks until the insert job has finished.
    """
    batch_env = ExecutionEnvironment.get_execution_environment()
    batch_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(batch_env, TableConfig())

    # Source: a single STRING column named "word".
    source = table_env.connect(
        FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input'))
    source_format = OldCsv().field('word', DataTypes.STRING())
    source_schema = Schema().field('word', DataTypes.STRING())
    source.with_format(source_format) \
        .with_schema(source_schema) \
        .create_temporary_table('mySource')

    # Sink: an error is raised if the output file already exists.
    sink = table_env.connect(
        FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output'))
    sink_format = OldCsv() \
        .field_delimiter('\t') \
        .field('word', DataTypes.STRING()) \
        .field('count', DataTypes.BIGINT())
    sink_schema = Schema() \
        .field('word', DataTypes.STRING()) \
        .field('count', DataTypes.BIGINT())
    sink.with_format(sink_format) \
        .with_schema(sink_schema) \
        .create_temporary_table('mySink')

    words = table_env.from_path('mySource')
    counts = words.group_by(words.word).select(words.word, lit(1).count)
    counts.execute_insert('mySink').wait()
def create_env(
        self) -> (ExecutionEnvironment, TableEnvironment, StatementSet):
    """Build a streaming execution environment, table environment and statement set.

    The execution environment is a ``StreamExecutionEnvironment`` with
    parallelism 1, and ``taskmanager.memory.task.off-heap.size`` is set to
    ``80m`` on the table environment's configuration.

    Returns:
        A ``(exec_env, t_env, statement_set)`` tuple.
    """
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    stream_env.set_parallelism(1)

    table_env = StreamTableEnvironment.create(stream_env, TableConfig())
    flink_conf = table_env.get_config().get_configuration()
    flink_conf.set_string("taskmanager.memory.task.off-heap.size", '80m')

    return stream_env, table_env, table_env.create_statement_set()
def word_count():
    """Count the words of the "New York City" Wikipedia summary.

    The summary is fetched with ``wikipedia.page``, split on single spaces,
    grouped by word and written as tab-delimited CSV to ``/tmp/output``
    through the temporary table ``Results``.

    Any existing file or directory at the sink path is removed first, so that
    a previous run's output does not get in the way of the file-system sink.
    A failure to remove it is logged, not raised.
    """
    result = wikipedia.page("New York City")
    content = result.summary

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    print(add.add(10, 5))
    print("Word Count")

    # Clean up the path the sink actually writes to.
    sink_path = '/tmp/output'
    if os.path.exists(sink_path):
        try:
            if os.path.isfile(sink_path):
                os.remove(sink_path)
            else:
                shutil.rmtree(sink_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", sink_path)

    # Register the Results table in the table environment.
    t_env.connect(FileSystem().path(sink_path)) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('Results')

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute("word_count")
def word_count():
    """Count words of a fixed text and write the counts to the ``Results`` table.

    The sink is a CSV file-system table at ``<tempdir>/result``, declared
    through SQL DDL. Any existing file or directory at that path is removed
    first; a failure to remove it is logged, not raised. The function blocks
    until the insert job has finished.
    """
    text = (
        "line Licensed to the Apache Software Foundation ASF under one "
        "line or more contributor license agreements See the NOTICE file "
        "line distributed with this work for additional information "
        "line regarding copyright ownership The ASF licenses this file "
        "to you under the Apache License Version the "
        "License you may not use this file except in compliance "
        "with the License"
    )

    table_config = TableConfig()
    batch_env = ExecutionEnvironment.get_execution_environment()
    # Disabled: table_config.set_python_executable("/opt/python38/bin/python3")
    # Original note: add `python.client.executable: /usr/bin/python3`
    # to con/flink-conf.yaml.
    table_env = BatchTableEnvironment.create(batch_env, table_config)

    # Clear any previous output before registering the Results table.
    output_path = '{}/result'.format(tempfile.gettempdir())
    if os.path.exists(output_path):
        try:
            remover = os.remove if os.path.isfile(output_path) else shutil.rmtree
            remover(output_path)
        except OSError as err:
            logging.error("Error removing directory: %s - %s.", err.filename, err.strerror)

    logging.info("Results directory: %s", output_path)

    ddl_template = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """
    table_env.execute_sql(ddl_template.format(output_path))

    rows = [(token, 1) for token in text.split(" ")]
    words = table_env.from_elements(rows, ["word", "count"])
    counts = words.group_by(words.word) \
        .select(words.word, expr.lit(1).count.alias('count'))
    counts.execute_insert("Results").wait()
def word_count():
    """Count words of a fixed text, with each word's ``len`` UDF result.

    The sink ``Results`` is a comma-delimited CSV file-system table at
    ``<tempdir>/result`` with columns ``word``, ``len`` and ``count``. Any
    existing file or directory at that path is removed first; a failure to
    remove it is logged, not raised. ``len`` is the Java function
    ``org.apache.flink.udf.UDFLength``.
    """
    text = (
        "line Licensed to the Apache Software Foundation ASF under one "
        "line or more contributor license agreements See the NOTICE file "
        "line distributed with this work for additional information "
        "line regarding copyright ownership The ASF licenses this file "
        "to you under the Apache License Version the "
        "License you may not use this file except in compliance "
        "with the License"
    )

    table_config = TableConfig()
    batch_env = ExecutionEnvironment.get_execution_environment()
    batch_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(batch_env, table_config)

    # Clear any previous output before registering the Results sink.
    output_path = '{}/result'.format(tempfile.gettempdir())
    if os.path.exists(output_path):
        try:
            remover = os.remove if os.path.isfile(output_path) else shutil.rmtree
            remover(output_path)
        except OSError as err:
            logging.error("Error removing directory: %s - %s.", err.filename, err.strerror)

    logging.info("Results directory: %s", output_path)

    sink = table_env.connect(FileSystem().path(output_path))
    sink_format = OldCsv() \
        .field_delimiter(',') \
        .field("word", DataTypes.STRING()) \
        .field("len", DataTypes.INT()) \
        .field("count", DataTypes.BIGINT())
    sink_schema = Schema() \
        .field("word", DataTypes.STRING()) \
        .field("len", DataTypes.INT()) \
        .field("count", DataTypes.BIGINT())
    sink.with_format(sink_format) \
        .with_schema(sink_schema) \
        .register_table_sink("Results")

    table_env.register_java_function("len", "org.apache.flink.udf.UDFLength")

    rows = [(token, 1) for token in text.split(" ")]
    words = table_env.from_elements(rows, ["word", "count"])
    counts = words.group_by("word").select("word, len(word), count(1) as count")
    counts.insert_into("Results")

    table_env.execute("word_count")
def word_count():
    """Count the words of five input files and write the counts to ``Results``.

    The files ``/home/mnm/flink-1.9.1/1`` to ``/home/mnm/flink-1.9.1/5`` are
    read and concatenated in order, split on single spaces, grouped by word
    and written as comma-delimited CSV to ``<tempdir>/result``. Any existing
    file or directory at that path is removed first; a failure to remove it
    is logged, not raised.

    Raises:
        OSError: If one of the input files cannot be opened or read.
    """
    input_paths = (
        "/home/mnm/flink-1.9.1/1",
        "/home/mnm/flink-1.9.1/2",
        "/home/mnm/flink-1.9.1/3",
        "/home/mnm/flink-1.9.1/4",
        "/home/mnm/flink-1.9.1/5",
    )
    # Read each file under a context manager so its handle is closed promptly.
    parts = []
    for path in input_paths:
        with open(path, "r") as source:
            parts.append(source.read())
    content = "".join(parts)

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # Clear any previous output before registering the Results sink.
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute("Python batch word count")
def word_count():
    """Count the words of a short fixed sentence and write them to ``out.txt``.

    The sink ``Results`` is a CSV file-system table at ``out.txt`` under the
    current working directory. An existing regular file at that path is
    removed first; a failure to remove it is printed, not raised.
    """
    # Declare the table environment and its configuration.
    batch_env = ExecutionEnvironment.get_execution_environment()
    batch_env.set_parallelism(1)
    table_env = BatchTableEnvironment.create(batch_env, TableConfig())

    # Remove a previous output file, then register the Results table.
    target = '{}/out.txt'.format(os.path.abspath('.'))
    if os.path.isfile(target):
        try:
            os.remove(target)
        except OSError as err:
            print("Error", err.filename, err.strerror)

    print("Results:", target)

    ddl_template = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """
    table_env.sql_update(ddl_template.format(target))

    # Build the source table from one string, count per word and write the
    # result to the Results table.
    sentence = "Who's there? I think I hear them. Stand, ho! Who's there?"
    rows = [(token, 1) for token in sentence.split(" ")]
    words = table_env.from_elements(rows, ["word", "count"])
    counts = words.group_by("word").select("word, count(1) as count")
    counts.insert_into("Results")

    # Execute the Flink Python Table API job.
    table_env.execute("word_count")
class main():
    """Word-count job written as a class body.

    Every statement below runs once, when the class statement itself is
    executed, and each assigned name (``exec_env``, ``t_config``, ``t_env``,
    ``my_source_ddl``) becomes a class attribute.
    """
    # Batch execution environment with parallelism 1.
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # Disabled alternative: register the source through the descriptor API.
    # t_env.connect(FileSystem().path('./temp/deviceorientation')) \
    #     .with_format(OldCsv()
    #                  .field('word', DataTypes.STRING())) \
    #     .with_schema(Schema()
    #                  .field('word', DataTypes.STRING())) \
    #     .create_temporary_table('mySource')

    # Source table: a CSV file-system table with a single `word` column.
    my_source_ddl = """ create table mySource ( word VARCHAR ) with ( 'connector.type' = 'filesystem', 'format.type' = 'csv', 'connector.path' = './temp/input' ) """

    t_env.sql_update(my_source_ddl)

    # Sink table: tab-delimited CSV at /tmp/output with `word` and `count`.
    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    # Group the source rows by word and insert the per-word counts into the sink.
    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')

    # Submit the job under the name "tutorial_job".
    t_env.execute("tutorial_job")
def run(self):
    """Run the word-count job, then read data and query the PMML model.

    Registers ``mySource`` (``/tmp/input``) and ``mySink`` (``/tmp/output``,
    tab-delimited) as temporary tables, loads the model from
    ``./../batch_ml/model.pmml``, executes the job ``tutorial_job``, calls
    ``self.read_data()`` and finally asks the model for one prediction. The
    prediction is assigned to a local and not returned.
    """
    batch_env = ExecutionEnvironment.get_execution_environment()
    batch_env.set_parallelism(1)
    # NOTE(review): a batch ExecutionEnvironment is handed to
    # StreamTableEnvironment.create here; confirm this is intended.
    table_env = StreamTableEnvironment.create(batch_env, TableConfig())

    # Source: a single STRING column named "word".
    source = table_env.connect(FileSystem().path('/tmp/input'))
    source_format = OldCsv().field('word', DataTypes.STRING())
    source_schema = Schema().field('word', DataTypes.STRING())
    source.with_format(source_format) \
        .with_schema(source_schema) \
        .create_temporary_table('mySource')

    # Sink: tab-delimited "word" / "count" columns.
    sink = table_env.connect(FileSystem().path('/tmp/output'))
    sink_format = OldCsv() \
        .field_delimiter('\t') \
        .field('word', DataTypes.STRING()) \
        .field('count', DataTypes.BIGINT())
    sink_schema = Schema() \
        .field('word', DataTypes.STRING()) \
        .field('count', DataTypes.BIGINT())
    sink.with_format(sink_format) \
        .with_schema(sink_schema) \
        .create_temporary_table('mySink')

    pmml_model = Model.fromFile('./../batch_ml/model.pmml')

    counts = table_env.from_path('mySource').group_by('word').select('word, count(1)')
    counts.insert_into('mySink')
    table_env.execute("tutorial_job")

    self.read_data()

    sample = {
        "Sepal_Length": 5.1,
        "Sepal_Width": 3.5,
        "Petal_Length": 1.4,
        "Petal_Width": 0.2
    }
    result = pmml_model.predict(sample)
def setUp(self):
    """Create the batch environments used by each test.

    Sets ``self.env`` (parallelism 2) and ``self.t_env``, a
    ``BatchTableEnvironment`` built on it with a fresh ``TableConfig``.
    """
    super(PyFlinkBatchTableTestCase, self).setUp()

    batch_env = ExecutionEnvironment.get_execution_environment()
    self.env = batch_env
    batch_env.set_parallelism(2)
    self.t_env = BatchTableEnvironment.create(batch_env, TableConfig())
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

# Batch word count: read space-delimited words from `input`, count them per
# word and write comma-delimited `word,count` rows to `output`.
exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(2)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

# Source: every space-delimited token becomes one row with a `word` column.
_source = t_env.connect(FileSystem().path('input'))
_source_format = OldCsv().line_delimiter(' ').field('word', DataTypes.STRING())
_source_schema = Schema().field('word', DataTypes.STRING())
_source.with_format(_source_format) \
    .with_schema(_source_schema) \
    .register_table_source("inputSource")

# Sink: comma-delimited `word` and `count` columns.
_sink = t_env.connect(FileSystem().path('output'))
_sink_format = OldCsv() \
    .field_delimiter(',') \
    .field('word', DataTypes.STRING()) \
    .field('count', DataTypes.BIGINT())
_sink_schema = Schema() \
    .field('word', DataTypes.STRING()) \
    .field('count', DataTypes.BIGINT())
_sink.with_format(_sink_format) \
    .with_schema(_sink_schema) \
    .register_table_sink('sink')

# Count per word, route the result to the sink and run the job.
_counts = t_env.scan('inputSource').group_by('word').select('word, count(1)')
_counts.insert_into('sink')

t_env.execute('my first job')
def test_set_get_python_executable(self):
    """The Python executable set on a ``TableConfig`` is returned by its getter."""
    expected = "/usr/bin/python3"
    config = TableConfig()

    config.set_python_executable(expected)

    self.assertEqual(expected, config.get_python_executable())
def flink_init_batch_env(self, line):
    """Create a batch table environment and store it on ``self.t_env``.

    Args:
        line: Not used by this method.
    """
    batch_env = ExecutionEnvironment.get_execution_environment()
    batch_env.set_parallelism(1)
    self.t_env = BatchTableEnvironment.create(batch_env, TableConfig())
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

# Batch word count: read words from /tmp/input and route per-word counts to
# a tab-delimited sink at /tmp/output.

# Batch execution environment with parallelism 1.
EXEC_ENV = ExecutionEnvironment.get_execution_environment()
EXEC_ENV.set_parallelism(1)
T_CONFIG = TableConfig()
T_ENV = BatchTableEnvironment.create(EXEC_ENV, T_CONFIG)

# Source table: a single STRING column named `word`.
T_ENV.connect(FileSystem().path('/tmp/input')) \
    .with_format(OldCsv()
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .create_temporary_table('mySource')

# Sink table: tab-delimited `word` (STRING) and `count` (BIGINT) columns.
T_ENV.connect(FileSystem().path('/tmp/output')) \
    .with_format(OldCsv()
                 .field_delimiter('\t')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

# Group by word and insert the counts into the sink.
# NOTE(review): no T_ENV.execute(...) call is visible in this snippet; confirm
# the job is submitted further down the file.
T_ENV.from_path('mySource') \
    .group_by('word') \
    .select('word, count(1)') \
    .insert_into('mySink')