def word_count():
    """Batch word-count example.

    Writes one ``<word>,1`` CSV row per word of a sample text to a temp
    file, registers it as a source table, counts occurrences per word
    with the Table API, and writes the aggregate to a CSV sink.
    """
    tmp_dir = tempfile.gettempdir()
    source_path = os.path.join(tmp_dir, 'streaming.csv')
    if os.path.isfile(source_path):
        os.remove(source_path)
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"
    # One "<word>,1" row per word.  The explicit flush()/close() the
    # original carried are redundant inside a ``with`` block.
    with open(source_path, 'w') as f:
        for word in content.split(" "):
            f.write(",".join([word, "1"]))
            f.write("\n")

    t_config = TableConfig.Builder().as_batch_execution().set_parallelism(1).build()
    t_env = TableEnvironment.create(t_config)

    # BUG FIX: the second field was registered as "cout" while the query
    # below aliases the aggregate as "count" -- the names must agree.
    field_names = ["word", "count"]
    field_types = [DataTypes.STRING, DataTypes.LONG]

    # Register the source table in the table environment.
    t_env.register_table_source(
        "Word",
        CsvTableSource(source_path, field_names, field_types))

    # Register the sink table, removing any stale output file first.
    tmp_csv = os.path.join(tmp_dir, 'streaming2.csv')
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink("Results", field_names, field_types,
                              CsvTableSink(tmp_csv))

    t_env.scan("Word") \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute()
def word_count():
    """Count word occurrences in a sample text and emit ``(word, count)``
    rows as CSV under a ``result`` path in the temp directory.
    """
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    env_config = TableConfig.Builder().as_batch_execution().build()
    table_env = TableEnvironment.create(env_config)

    # Clear out any previous run's output, which may be a file or a directory.
    base_dir = tempfile.gettempdir()
    result_path = base_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.",
                          e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    # Describe the CSV sink and register it as table "Results".
    sink_format = OldCsv() \
        .field_delimiter(',') \
        .field("word", DataTypes.STRING()) \
        .field("count", DataTypes.BIGINT())
    sink_schema = Schema() \
        .field("word", DataTypes.STRING()) \
        .field("count", DataTypes.BIGINT())
    table_env.connect(FileSystem().path(result_path)) \
        .with_format(sink_format) \
        .with_schema(sink_schema) \
        .register_table_sink("Results")

    # Emit one (word, 1) element per word, aggregate, and write the result.
    word_rows = [(word, 1) for word in content.split(" ")]
    table_env.from_elements(word_rows, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    table_env.execute()
def test_end_to_end():
    """End-to-end smoke test.

    Streams rows from a CSV source table through a filter and projection
    into a CSV sink, then asserts the sink file's exact content.
    """
    tmp_dir = tempfile.gettempdir()
    source_path = os.path.join(tmp_dir, 'streaming.csv')
    if os.path.isfile(source_path):
        os.remove(source_path)
    # The redundant explicit close() inside the ``with`` block was removed.
    with open(source_path, 'w') as f:
        lines = '1,hi,hello\n' + '2,hi,hello\n'
        f.write(lines)

    _find_flink_home()
    print("using %s as FLINK_HOME..." % os.environ["FLINK_HOME"])

    t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
    t_env = TableEnvironment.get_table_environment(t_config)

    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

    # Register the source table in the table environment.
    t_env.register_table_source(
        "Orders",
        CsvTableSource(source_path, field_names, field_types))

    # Register the sink table, removing any stale output file first.
    tmp_csv = os.path.join(tmp_dir, 'streaming2.csv')
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink(
        "Results", field_names, field_types, CsvTableSink(tmp_csv))

    t_env.scan("Orders") \
        .where("a > 0") \
        .select("a + 1, b, c") \
        .insert_into("Results")

    t_env.execute()

    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '2,hi,hello\n' + '3,hi,hello\n'
    print("test passed, the log file is under this directory: %s/log"
          % os.environ["FLINK_HOME"])
def setUp(self):
    """Create a batch-mode TableEnvironment with parallelism 1."""
    super(PyFlinkBatchTableTestCase, self).setUp()
    builder = TableConfig.Builder().as_batch_execution()
    self.t_config = builder.set_parallelism(1).build()
    self.t_env = TableEnvironment.create(self.t_config)
def setUp(self):
    """Create a streaming-mode TableEnvironment with parallelism 4."""
    super(PyFlinkStreamTableTestCase, self).setUp()
    builder = TableConfig.Builder().as_streaming_execution()
    self.t_config = builder.set_parallelism(4).build()
    self.t_env = TableEnvironment.get_table_environment(self.t_config)