def main_flink():
    # Earlier steps read the raw files, preprocess them, and write the result to the `input` file.
    env = StreamExecutionEnvironment.get_execution_environment()
    parr_num = 4
    env.set_parallelism(parr_num)
    t_env = StreamTableEnvironment.create(env)

    @udf(input_types=DataTypes.STRING(), result_type=DataTypes.STRING())
    def cut_extract(string):
        return cut_posseg.cut_extract(string)

    t_env.register_function("cut_extract", cut_extract)

    # A table is created here and read from `input`.
    # Question 1: is there a way to use a custom Python list (e.g. [text_a, text_b]) as the
    # source instead of a file, to save the IO overhead?
    t_env.connect(FileSystem().path('/home/sjtu/input')) \
        .with_format(OldCsv()
                     .field('text', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('text', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    t_env.connect(FileSystem().path('/home/sjtu/output')) \
        .with_format(OldCsv()
                     .field('result', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('result', DataTypes.STRING())) \
        .create_temporary_table('mySink')

    t_env.from_path('mySource') \
        .select("cut_extract(text)") \
        .insert_into('mySink')

    # Question 2: the result table is written to a file here, but this program still needs to keep
    # processing the data. Is there a way to get the mySink table back as in-memory data instead of
    # reading it from disk again?
    t_env.execute("tutorial_job")
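Not part of the original code: a minimal sketch of how the two in-line questions above could be handled with in-memory tables, assuming a recent PyFlink (1.11+) and the cut_extract UDF registered as in the snippet; `texts` is a hypothetical placeholder list.

# Hedged sketch, not the original author's code.
texts = ["text A", "text B"]  # hypothetical in-memory input replacing /home/sjtu/input

# Question 1: build the source table directly from a Python list, avoiding file IO.
source = t_env.from_elements([(t,) for t in texts], ['text'])
result = source.select("cut_extract(text)")

# Question 2: pull the result back into the driver process instead of writing it to
# /home/sjtu/output and re-reading it (Table.to_pandas is available in PyFlink 1.11+).
df = result.to_pandas()  # pandas DataFrame held in memory
# or: rows = list(result.execute().collect())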
def test_temporary_tables(self):
    t_env = self.t_env
    t_env.connect(FileSystem().path(os.path.join(self.tempdir + '/temp_1.csv'))) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())) \
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())) \
        .create_temporary_table("temporary_table_1")

    t_env.connect(FileSystem().path(os.path.join(self.tempdir + '/temp_2.csv'))) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())) \
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())) \
        .create_temporary_table("temporary_table_2")

    actual = t_env.list_temporary_tables()
    expected = ['temporary_table_1', 'temporary_table_2']
    self.assert_equals(actual, expected)

    t_env.drop_temporary_table("temporary_table_1")
    actual = t_env.list_temporary_tables()
    expected = ['temporary_table_2']
    self.assert_equals(actual, expected)
def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)  # StreamExecutionEnvironment

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    # An error is raised if the output file already exists.
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
def test_field_delimiter(self):
    csv = OldCsv().field_delimiter("|")
    properties = csv.to_properties()
    expected = {'format.field-delimiter': '|',
                'format.type': 'csv',
                'format.property-version': '1'}
    self.assertEqual(expected, properties)
def test_ignore_parse_errors(self):
    csv = OldCsv().ignore_parse_errors()
    properties = csv.to_properties()
    expected = {'format.ignore-parse-errors': 'true',
                'format.type': 'csv',
                'format.property-version': '1'}
    self.assertEqual(expected, properties)
def test_quote_character(self):
    csv = OldCsv().quote_character("*")
    properties = csv.to_properties()
    expected = {'format.quote-character': '*',
                'format.type': 'csv',
                'format.property-version': '1'}
    self.assertEqual(expected, properties)
def test_comment_prefix(self):
    csv = OldCsv().comment_prefix("#")
    properties = csv.to_properties()
    expected = {'format.comment-prefix': '#',
                'format.type': 'csv',
                'format.property-version': '1'}
    self.assertEqual(expected, properties)
def test_ignore_first_line(self):
    csv = OldCsv().ignore_first_line()
    properties = csv.to_properties()
    expected = {'format.ignore-first-line': 'true',
                'format.type': 'csv',
                'format.property-version': '1'}
    self.assertEqual(expected, properties)
def test_line_delimiter(self):
    csv = OldCsv().line_delimiter(";")
    expected = {'format.type': 'csv',
                'format.property-version': '1',
                'format.line-delimiter': ';'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)
def test_schema(self):
    csv = OldCsv()
    schema = TableSchema(["a", "b"], [DataTypes.INT(), DataTypes.STRING()])
    csv = csv.schema(schema)
    properties = csv.to_properties()
    expected = {'format.fields.0.name': 'a',
                'format.fields.0.data-type': 'INT',
                'format.fields.1.name': 'b',
                'format.fields.1.data-type': 'VARCHAR(2147483647)',
                'format.type': 'csv',
                'format.property-version': '1'}
    self.assertEqual(expected, properties)
def _local_execute_func(exec_func, write_func, pickle_func, python_path):
    table_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance(
        ).use_blink_planner().in_batch_mode().build())
    table_env.get_config().get_configuration().set_string(
        'parallelism.default', '1')
    table_env.get_config().set_python_executable(python_path)
    table_env.register_function(
        exec_func,
        udf(lambda _: pickle_func, DataTypes.BIGINT(), DataTypes.STRING()))
    table_env.connect(FileSystem().path(write_func)) \
        .with_format(OldCsv().field('func', DataTypes.STRING())) \
        .with_schema(Schema().field('func', DataTypes.STRING())) \
        .create_temporary_table(exec_func)
    table = table_env.from_elements([(1, 'Joblib')])
    table.select('{}(_1)'.format(exec_func)).insert_into(exec_func)
    table_env.execute(exec_func)
    # decode execution result from table sink file.
    execute_result = cloudpickle.loads(
        codecs.decode(
            pd.DataFrame(pd.read_csv(write_func))[0:].columns[0].encode(),
            'base64'))
    # remove table sink file to clear ineffective files.
    os.remove(write_func)
    return execute_result
def execute(self, function_context: FlinkFunctionContext, input_list: List[Table]) -> List[Table]:
    t_env = function_context.get_table_env()
    statement_set = function_context.get_statement_set()
    table = input_list[0]
    Popen('rm -rf /root/debug', shell=True)
    t_env.register_function(
        "build_index",
        udf(BuildIndexUDF(self.path, self.element_type, self.dimension),
            [DataTypes.STRING(), DataTypes.STRING()], DataTypes.STRING()))
    dummy_output_path = '/tmp/indexed_key'
    if os.path.exists(dummy_output_path):
        if os.path.isdir(dummy_output_path):
            shutil.rmtree(dummy_output_path)
        else:
            os.remove(dummy_output_path)
    t_env.connect(FileSystem().path(dummy_output_path)) \
        .with_format(OldCsv()
                     .field('key', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('key', DataTypes.STRING())) \
        .create_temporary_table('train_sink')
    statement_set.add_insert(
        "train_sink", table.select("build_index(uuid, feature_data)"))
    return []
def test_register_table_source_and_register_table_sink(self):
    self.env.set_parallelism(1)
    source_path = os.path.join(self.tempdir + '/streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    self.prepare_csv_source(source_path, data, field_types, field_names)
    sink_path = os.path.join(self.tempdir + '/streaming2.csv')
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    t_env = self.t_env
    # register_table_source
    t_env.connect(FileSystem().path(source_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING())) \
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING())) \
        .register_table_source("source")
    # register_table_sink
    t_env.connect(FileSystem().path(sink_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING())) \
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING())) \
        .register_table_sink("sink")
    t_env.scan("source") \
        .select("a + 1, b, c") \
        .insert_into("sink")
    self.t_env.execute("test")
    with open(sink_path, 'r') as f:
        lines = f.read()
        assert lines == '2,Hi,Hello\n' + '3,Hello,Hello\n'
def process(self, execution_context: flink.ExecutionContext,
            input_list: List[Table] = None) -> List[Table]:
    input_file = os.path.join(os.getcwd(), 'resources', 'word_count.txt')
    t_env = execution_context.table_env
    t_env.connect(FileSystem().path(input_file)) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    return [t_env.from_path('mySource')]
def execute(self, function_context: FlinkFunctionContext) -> Table:
    example_meta: af.ExampleMeta = function_context.get_example_meta()
    t_env = function_context.get_table_env()
    t_env.connect(FileSystem().path(example_meta.batch_uri)) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    return t_env.from_path('mySource')
def test_register_temporary_table(self):
    self.t_env.get_config().get_configuration().set_string(
        "parallelism.default", "1")
    source_path = os.path.join(self.tempdir + '/streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    self.prepare_csv_source(source_path, data, field_types, field_names)
    sink_path = os.path.join(self.tempdir + '/streaming2.csv')
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    t_env = self.t_env
    t_env.connect(FileSystem().path(source_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING())) \
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING())) \
        .create_temporary_table("source")
    t_env.connect(FileSystem().path(sink_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING())) \
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING())) \
        .create_temporary_table("sink")
    t_env.from_path("source").select("a + 1, b, c").execute_insert(
        "sink").wait()
    with open(sink_path, 'r') as f:
        lines = f.read()
        assert lines == '2,Hi,Hello\n' + "3,Hello,Hello\n"
def word_count():
    result = wikipedia.page("New York City")
    content = result.summary
    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)
    print(add.add(10, 5))
    print("Word Count")

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)
    logging.info("Results directory: %s", result_path)

    # sink_ddl = """
    #     create table Results(
    #         word VARCHAR,
    #         `count` BIGINT
    #     ) with (
    #         'connector.type' = 'filesystem',
    #         'format.type' = 'csv',
    #         'connector.path' = '{}'
    #     )
    #     """.format(result_path)
    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('Results')
    # t_env.sql_update(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")
    t_env.execute("word_count")
def test_with_schema(self):
    descriptor = self.t_env.connect(FileSystem())
    descriptor = descriptor.with_format(OldCsv()).with_schema(Schema().field("a", "INT"))
    properties = descriptor.to_properties()
    expected = {'schema.0.name': 'a',
                'schema.0.data-type': 'INT',
                'format.type': 'csv',
                'format.property-version': '1',
                'connector.type': 'filesystem',
                'connector.property-version': '1'}
    assert properties == expected
def run(self):
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = StreamTableEnvironment.create(exec_env, t_config)

    t_env.connect(FileSystem().path('/tmp/input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    model = Model.fromFile('./../batch_ml/model.pmml')
    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')
    t_env.execute("tutorial_job")
    self.read_data()
    result = model.predict({
        "Sepal_Length": 5.1,
        "Sepal_Width": 3.5,
        "Petal_Length": 1.4,
        "Petal_Width": 0.2
    })
def test_in_append_mode(self):
    descriptor = self.t_env.connect(FileSystem())
    descriptor = descriptor \
        .with_format(OldCsv()) \
        .in_append_mode()
    properties = descriptor.to_properties()
    expected = {'update-mode': 'append',
                'format.type': 'csv',
                'format.property-version': '1',
                'connector.property-version': '1',
                'connector.type': 'filesystem'}
    assert properties == expected
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"
    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)
    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    t_env.register_java_function("len", "org.apache.flink.udf.UDFLength")
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, len(word), count(1) as count") \
        .insert_into("Results")
    t_env.execute("word_count")
def process(self, execution_context: flink.ExecutionContext,
            input_list: List[Table] = None) -> List[Table]:
    output_file = os.path.join(os.getcwd(), 'output')
    if os.path.exists(output_file):
        os.remove(output_file)
    t_env = execution_context.table_env
    statement_set = execution_context.statement_set
    t_env.connect(FileSystem().path(output_file)) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')
    statement_set.add_insert('mySink', input_list[0])
    return []
def execute(self, function_context: FlinkFunctionContext, input_table: Table) -> None:
    example_meta: af.ExampleMeta = function_context.get_example_meta()
    output_file = example_meta.batch_uri
    if os.path.exists(output_file):
        os.remove(output_file)
    t_env = function_context.get_table_env()
    statement_set = function_context.get_statement_set()
    t_env.connect(FileSystem().path(output_file)) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')
    statement_set.add_insert('mySink', input_table)
def word_count():
    f1 = open("/home/mnm/flink-1.9.1/1", "r")
    f2 = open("/home/mnm/flink-1.9.1/2", "r")
    f3 = open("/home/mnm/flink-1.9.1/3", "r")
    f4 = open("/home/mnm/flink-1.9.1/4", "r")
    f5 = open("/home/mnm/flink-1.9.1/5", "r")
    content = f1.read() + f2.read() + f3.read() + f4.read() + f5.read()
    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)
    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")
    t_env.execute("Python batch word count")
def test_field(self):
    csv = OldCsv()
    csv.field("a", DataTypes.BIGINT())
    csv.field("b", DataTypes.STRING())
    csv.field("c", "SQL_TIMESTAMP")
    properties = csv.to_properties()
    expected = {'format.fields.0.name': 'a',
                'format.fields.0.data-type': 'BIGINT',
                'format.fields.1.name': 'b',
                'format.fields.1.data-type': 'VARCHAR(2147483647)',
                'format.fields.2.name': 'c',
                'format.fields.2.data-type': 'TIMESTAMP(3)',
                'format.type': 'csv',
                'format.property-version': '1'}
    self.assertEqual(expected, properties)
class main():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # t_env.connect(FileSystem().path('./temp/deviceorientation')) \
    #     .with_format(OldCsv()
    #                  .field('word', DataTypes.STRING())) \
    #     .with_schema(Schema()
    #                  .field('word', DataTypes.STRING())) \
    #     .create_temporary_table('mySource')
    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = './temp/input'
        )
    """
    t_env.sql_update(my_source_ddl)

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')
    t_env.execute("tutorial_job")
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.udf import udf

env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)

add = udf(lambda i, j: i + j, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())
t_env.register_function("add", add)

t_env.connect(FileSystem().path('/opt/examples/data/udf_add_input')) \
    .with_format(OldCsv()
                 .field('a', DataTypes.BIGINT())
                 .field('b', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('a', DataTypes.BIGINT())
                 .field('b', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/opt/examples/data/udf_add_output')) \
    .with_format(OldCsv()
                 .field('sum', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sum', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

t_env.from_path('mySource') \
    .select("add(a, b)") \
    .insert_into('mySink')
def word_count():
    environment_settings = EnvironmentSettings.new_instance().in_batch_mode(
    ).use_blink_planner().build()
    t_env = BatchTableEnvironment.create(
        environment_settings=environment_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)
    logging.info("Results directory: %s", result_path)

    # set the Python executable explicitly in case `python` does not point to Python 3
    t_env.get_config().set_python_executable("python3")

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(), result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()), DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)
    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    data = [
        ("iPhone 11,30,5499,Beijing",),
        ("iPhone 11 Pro,20,8699,Guangzhou",),
        ("MacBook Pro,10,9999,Beijing",),
        ("AirPods Pro,50,1999,Beijing",),
        ("MacBook Pro,10,11499,Shanghai",),
        ("iPhone 11,30,5999,Shanghai",),
        ("iPhone 11 Pro,20,9999,Shenzhen",),
        ("MacBook Pro,10,13899,Hangzhou",),
        ("iPhone 11,10,6799,Beijing",),
        ("MacBook Pro,10,18999,Beijing",),
        ("iPhone 11 Pro,10,11799,Shenzhen",),
        ("MacBook Pro,10,22199,Shanghai",),
        ("AirPods Pro,40,1999,Shanghai",)
    ]
    t_env.from_elements(data, ["line"]) \
        .select("split(line) as str_array") \
        .select("get(str_array, 3) as city, "
                "get(str_array, 1).cast(LONG) as count, "
                "get(str_array, 2).cast(LONG) as unit_price") \
        .select("city, count, count * unit_price as total_price") \
        .group_by("city") \
        .select("city, "
                "sum(count) as sales_volume, "
                "sum(total_price) as sales") \
        .insert_into("Results")
    t_env.execute("word_count")
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(
    env,
    environment_settings=EnvironmentSettings.new_instance()
                                            .use_blink_planner()
                                            .build(),
)

result_path = '/notebooks/output.csv'
print('Results directory:', result_path)

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv()
    .field_delimiter(',')
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).with_schema(
    Schema()
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).register_table_sink(
    'Results'
)

# `content` is assumed to be defined earlier (e.g. in a previous notebook cell).
elements = [(word, 1) for word in content.split(' ')]
t_env.from_elements(elements, ['word', 'count']).group_by('word').select(
    'word, count(1) as count'
).insert_into('Results')
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(2)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('input')) \
    .with_format(OldCsv()
                 .line_delimiter(' ')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source("inputSource")

t_env.connect(FileSystem().path('output')) \
    .with_format(OldCsv()
                 .field_delimiter(',')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .register_table_sink('sink')

t_env.scan('inputSource').group_by('word').select('word, count(1)').insert_into('sink')
t_env.execute('my first job')