from pyflink.common import Types
from pyflink.common.time import Instant
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (DataTypes, TableDescriptor, Schema,
                           StreamTableEnvironment)
from pyflink.table.expressions import col, lit
from pyflink.table.udf import udaf
from pyflink.table.window import Tumble


def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
def test_equals_and_hash(self):
    config1 = StreamExecutionEnvironment.get_execution_environment().get_config()
    config2 = StreamExecutionEnvironment.get_execution_environment().get_config()

    self.assertEqual(config1, config2)
    self.assertEqual(hash(config1), hash(config2))

    config1.set_parallelism(12)
    config2.set_parallelism(11)
    self.assertNotEqual(config1, config2)
    self.assertNotEqual(hash(config1), hash(config2))

    config2.set_parallelism(12)
    self.assertEqual(config1, config2)
    self.assertEqual(hash(config1), hash(config2))
def setUp(self) -> None:
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(1)
    self.env.add_jars("file://{}".format(find_jar_path()))
    self.t_env = StreamTableEnvironment.create(self.env)
    self.source_table = self.t_env.from_descriptor(
        TableDescriptor.for_connector("datagen")
                       .schema(Schema.new_builder()
                               .column("x", DataTypes.INT())
                               .column("a", DataTypes.INT())
                               .build())
                       .option("fields.x.kind", "sequence")
                       .option("fields.x.start", "1")
                       .option("fields.x.end", "100")
                       .option("fields.a.kind", "sequence")
                       .option("fields.a.start", "101")
                       .option("fields.a.end", "200")
                       .build())
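# find_jar_path() above is not part of this snippet; a minimal sketch, assuming
# the connector jar sits in a local "jars" directory (path and pattern are
# illustrative):
def find_jar_path():
    import glob
    import os
    # pick the first jar next to this file (assumption)
    return glob.glob(os.path.join(os.path.dirname(__file__), "jars", "*.jar"))[0]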
def test_create_table_environment(self):
    table_config = TableConfig()
    table_config.set_max_generated_code_length(32000)
    table_config.set_null_check(False)
    table_config.set_timezone("Asia/Shanghai")

    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(env, table_config)

    read_table_config = t_env.get_config()
    self.assertFalse(read_table_config.get_null_check())
    self.assertEqual(read_table_config.get_max_generated_code_length(), 32000)
    self.assertEqual(read_table_config.get_timezone(), "Asia/Shanghai")
def kafka_to_mysql():
    """
    Read JSON data from a Kafka source and write it to MySQL.
    Example message: {"msg": "welcome flink users..."}
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
        CREATE TABLE kafka_source (
            msg STRING
        ) WITH (
            'connector' = 'kafka-0.11',
            'topic' = 'cdn-log',
            'properties.bootstrap.servers' = 'kafka:9092',
            'format' = 'json',
            'scan.startup.mode' = 'latest-offset'
        )
    """

    sink_ddl = """
        CREATE TABLE mysql_sink (
            msg STRING
        ) WITH (
            'connector' = 'jdbc',
            'url' = 'jdbc:mysql://mysql:3306/flinkdb?characterEncoding=utf-8&useSSL=false',
            'table-name' = 'cdn_log',
            'username' = 'root',
            'password' = '123456',
            'sink.buffer-flush.max-rows' = '1'
        )
    """

    # register the source and sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # read from the source
    tab = t_env.from_path("kafka_source")

    # For now we deliberately use the API that is marked deprecated, because the
    # new asynchronous submission still needs test improvements...
    tab.insert_into("mysql_sink")

    # run the job
    t_env.execute("kafka_to_mysql")
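# For reference, the non-deprecated counterpart of the insert_into()/execute()
# pair used above is a single blocking call (a sketch, assuming the same table
# names as in kafka_to_mysql):
#
#     t_env.from_path("kafka_source").execute_insert("mysql_sink").wait()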
def test_stream():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)
    # t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 1000000)
    # t_env.get_config().get_configuration().set_boolean("table.exec.mini-batch.enabled", True)
    # t_env.get_config().get_configuration().set_integer("table.exec.mini-batch.allow-latency", 1000)
    # t_env.get_config().get_configuration().set_integer("table.exec.mini-batch.size", 100000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)
    t_env.create_temporary_function("python_avg", MeanAggregateFunction())
    t_env.create_java_temporary_system_function("java_avg", "com.alibaba.flink.function.JavaAvg")

    num_rows = 10000000
    t_env.execute_sql(f"""
        CREATE TABLE source (
            id INT,
            num INT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) WITH (
            'connector' = 'Range',
            'start' = '1',
            'end' = '{num_rows}',
            'step' = '1',
            'partition' = '200'
        )
    """)

    t_env.register_table_sink(
        "sink",
        PrintTableSink(
            ["num", "value"],
            [DataTypes.INT(False), DataTypes.FLOAT(False)], 1000000))

    result = t_env.from_path("source") \
        .select("num % 1000 as num, id") \
        .group_by("num") \
        .select("num, python_avg(id)")
    result.insert_into("sink")

    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink stream group agg consume time: " + str(time.time() - beg_time))
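# MeanAggregateFunction above is not shown in this snippet; a minimal sketch of
# such a general Python UDAF computing an average (the accumulator layout is an
# assumption):
from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes


class MeanAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # accumulator holds (sum, count)
        return Row(0.0, 0)

    def accumulate(self, accumulator, value):
        if value is not None:
            accumulator[0] += value
            accumulator[1] += 1

    def get_value(self, accumulator):
        if accumulator[1] == 0:
            return None
        return accumulator[0] / accumulator[1]

    def get_accumulator_type(self):
        return DataTypes.ROW([DataTypes.FIELD("sum", DataTypes.DOUBLE()),
                              DataTypes.FIELD("count", DataTypes.BIGINT())])

    def get_result_type(self):
        return DataTypes.FLOAT()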
from pyflink.common import Types
from pyflink.common.time import Instant
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (DataTypes, TableDescriptor, Schema,
                           StreamTableEnvironment)
from pyflink.table.expressions import col, row_interval, CURRENT_ROW
from pyflink.table.window import Over


def over_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .build())
                       .build())

    # define the over window operation
    table = table.over_window(
        Over.partition_by("name")
            .order_by("ts")
            .preceding(row_interval(2))
            .following(CURRENT_ROW)
            .alias('w')) \
        .select(table.name, table.price.max.over(col('w')))

    # submit for execution
    table.execute_insert('sink') \
         .wait()
def __init__(self):
    # self.feature_extractor = DemoFeatureExtractor()
    self.settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(1)
    self.table_env = StreamTableEnvironment.create(
        self.env, environment_settings=self.settings)
    self.table_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)
    self.table_env.add_python_file('feature_extractors')

    with open('feature_extractors/source.sql', 'r') as f:
        source_table = f.read()
    with open('feature_extractors/sink.sql', 'r') as f:
        sink_table = f.read()
    self.table_env.execute_sql(source_table)
    self.table_env.execute_sql(sink_table)
def run():
    # get the execution environment
    env = StreamExecutionEnvironment.get_execution_environment()
    # configure the environment
    env_setting(env)
    # set the parallelism
    env.set_parallelism(1)

    # add jar files; on Windows, change these to your own jar file locations
    kafka_jar = f"file://{os.getcwd()}/jars/flink-connector-kafka_2.11-1.12.0.jar"
    kafka_client = f"file://{os.getcwd()}/jars/kafka-clients-2.4.1.jar"
    env.add_jars(kafka_jar, kafka_client)

    # add Python files
    env.add_python_file(f"{os.getcwd()}/config_file.py")
    env.add_python_file(f"{os.getcwd()}/env_setting.py")

    # use a packaged Python environment (custom environment archive)
    env.add_python_archive(f"{os.getcwd()}/venv.zip")
    env.set_python_executable("venv.zip/venv/bin/python")
    # or use the local Python environment
    # env.set_python_executable(PYTHON_EXECUTABLE)

    env.disable_operator_chaining()

    kafka_product_properties = get_kafka_Producer_properties(TEST_KAFKA_SERVERS)
    properties = get_kafka_customer_properties(TEST_KAFKA_SERVERS, TEST_GROUP_ID)

    data_stream = env.add_source(
        FlinkKafkaConsumer(topics=TEST_KAFKA_TOPIC,
                           properties=properties,
                           deserialization_schema=SimpleStringSchema())
        .set_commit_offsets_on_checkpoints(True)) \
        .name(f"consume data from the {TEST_KAFKA_TOPIC} topic")

    data_stream.map(lambda value: json.loads(value)) \
        .name("parse to json") \
        .map(lambda value: json.dumps(value), BasicTypeInfo.STRING_TYPE_INFO()) \
        .name("serialize to str") \
        .add_sink(FlinkKafkaProducer(topic=TEST_SINK_TOPIC,
                                     producer_config=kafka_product_properties,
                                     serialization_schema=SimpleStringSchema())) \
        .name("write to kafka")

    env.execute("test pyflink reading from and writing to kafka")
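# env_setting() above is imported from env_setting.py, which is not shown; a
# minimal sketch of what such a helper might configure (the checkpoint interval
# is an assumption):
def env_setting(env):
    # checkpoint every 60s so that Kafka offsets are committed on checkpoints
    env.enable_checkpointing(60000)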
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to one so that all data, including fired timers and
    # normal records, is processed by the same worker and the collected result
    # is ordered, which makes assertions easier.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    create_kafka_source_ddl = """
        CREATE TABLE payment_msg(
            createTime VARCHAR,
            rt AS TO_TIMESTAMP(createTime),
            orderId BIGINT,
            payAmount DOUBLE,
            payPlatform INT,
            provinceId INT,
            WATERMARK FOR rt AS rt - INTERVAL '2' SECOND
        ) WITH (
            'connector.type' = 'kafka',
            'connector.version' = 'universal',
            'connector.topic' = 'timer-stream-source',
            'connector.properties.bootstrap.servers' = 'localhost:9092',
            'connector.properties.group.id' = 'test_3',
            'connector.startup-mode' = 'earliest-offset',
            'format.type' = 'json'
        )
    """
    t_env.execute_sql(create_kafka_source_ddl)

    t = t_env.from_path("payment_msg").select(
        "createTime, orderId, payAmount, payPlatform, provinceId")
    source_type_info = Types.ROW([
        Types.STRING(),
        Types.LONG(),
        Types.DOUBLE(),
        Types.INT(),
        Types.INT()])
    ds = t_env.to_append_stream(table=t, type_info=source_type_info)
    producer_props = {'bootstrap.servers': 'localhost:9092',
                      'group.id': 'pyflink-e2e-source'}
    kafka_producer = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(),
                                        producer_props)
    ds.key_by(MyKeySelector(), key_type_info=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
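# MyKeySelector and MyProcessFunction above are defined elsewhere; a minimal
# sketch of what they might look like (the key field and the timer logic are
# assumptions):
from pyflink.datastream.functions import KeySelector, KeyedProcessFunction


class MyKeySelector(KeySelector):
    def get_key(self, value):
        # key by orderId, the second field of the row (assumption)
        return value[1]


class MyProcessFunction(KeyedProcessFunction):
    def process_element(self, value, ctx):
        # register an event-time timer shortly after the record's timestamp
        ctx.timer_service().register_event_time_timer(ctx.timestamp() + 100)
        yield "processed: " + str(value)

    def on_timer(self, timestamp, ctx):
        yield "timer fired at " + str(timestamp)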
import json

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment


def basic_operations():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # define the source
    ds = env.from_collection(
        collection=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        type_info=Types.ROW_NAMED(["id", "info"], [Types.INT(), Types.STRING()]))

    # map
    def update_tel(data):
        # parse the json
        json_data = json.loads(data.info)
        json_data['tel'] += 1
        return data.id, json.dumps(json_data)

    show(ds.map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')
    # (2, '{"name": "hello", "tel": 136, "addr": {"country": "China", "city": "Shanghai"}}')
    # (3, '{"name": "world", "tel": 125, "addr": {"country": "USA", "city": "NewYork"}}')
    # (4, '{"name": "PyFlink", "tel": 33, "addr": {"country": "China", "city": "Hangzhou"}}')

    # filter
    show(ds.filter(lambda data: data.id == 1).map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')

    # key by
    show(ds.map(lambda data: (json.loads(data.info)['addr']['country'],
                              json.loads(data.info)['tel']))
           .key_by(lambda data: data[0]).sum(1), env)
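# show() above is a small helper from the same example file; a minimal sketch:
def show(ds, env):
    # print the stream and run the pipeline so the results become visible
    ds.print()
    env.execute()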
def create_table_env(self):
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    stream_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime)
    stream_env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        stream_env,
        environment_settings=EnvironmentSettings.new_instance()
                                                .in_streaming_mode()
                                                .use_blink_planner()
                                                .build())
    statement_set = t_env.create_statement_set()
    t_env.get_config().set_python_executable(execute_path)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)
    t_env.get_config().get_configuration().set_string(
        "taskmanager.memory.task.off-heap.size", '512m')
    t_env.get_config().get_configuration().set_string("rest.port", '8081')
    return stream_env, t_env, statement_set
def usePyFlinkEnv(parallelism: int = None, flinkHome: str = None) -> MLEnv:
    global _mlenv
    if in_custom_env():
        print("Warning: usePyFlinkEnv will do nothing, since useCustomEnv is used to "
              "initialize MLEnv.")
        return _mlenv

    resetEnv()
    if flinkHome is not None:
        g_config["flink_home"] = flinkHome

    # Let PyFlink launch the gateway, and warn users to add jars to the PyFlink lib path
    print("Warning: You're running the script with 'getMLEnv'. "
          "You have to manually add Alink jars to PyFlink lib path to make the script work.")

    import pyflink
    # noinspection PyUnresolvedReferences
    gateway = pyflink.java_gateway.get_gateway()
    # noinspection PyUnresolvedReferences
    pyflink.java_gateway.import_flink_view(gateway)

    # In PyFlink 1.9 and 1.10, PyFlink doesn't start a callback server,
    # so we start one manually.
    success = gateway.start_callback_server(
        callback_server_parameters=CallbackServerParameters(
            port=0, daemonize=True, daemonize_connections=True))
    if success:
        callback_server_port = gateway.get_callback_server().get_listening_port()
        gateway.java_gateway_server.resetCallbackClient(
            gateway.java_gateway_server.getCallbackClient().getAddress(),
            callback_server_port)
    set_java_gateway(gateway)

    from pyflink.dataset import ExecutionEnvironment
    from pyflink.datastream import StreamExecutionEnvironment
    benv = ExecutionEnvironment.get_execution_environment()
    senv = StreamExecutionEnvironment.get_execution_environment()
    if parallelism is not None:
        benv.set_parallelism(parallelism)
        senv.set_parallelism(parallelism)
    # noinspection PyProtectedMember
    _mlenv = setup_py_mlenv(gateway, benv._j_execution_environment,
                            senv._j_stream_execution_environment)
    return _mlenv
def left_outer_join_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements(
        [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"),
         (2, "4b", "4bb"), (5, "5a", "5aa")],
        ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements(
        [(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")],
        ["d", "e", "f"]).select("d, e, f")

    result = left.left_outer_join(right, "a = d").select("a, b, e")
    # use a custom retract sink connector
    sink = TestRetractSink(["a", "b", "c"],
                           [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("left outer join streaming")
def tutorial():
    env = StreamExecutionEnvironment.get_execution_environment()
    jar_files = (
        'flink-connector-kafka_2.12-1.12.2.jar',
        'kafka-clients-2.4.1.jar',
    )
    jar_paths = tuple(
        'file://' + os.path.abspath(os.path.join(cur_path, jar_file))
        for jar_file in jar_files)
    env.add_jars(*jar_paths)
    env.add_classpaths(*jar_paths)
    env.set_parallelism(1)
    ds = env.add_source(
        FlinkKafkaConsumer(TOPIC, SimpleStringSchema(), KAFKA_PROPERTIES))
    ds.print()
    env.execute("tutorial_job")
def custom_test_source_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/custom_test_source_demo.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False)
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
        ).register_table_source("source")
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a"], [DataTypes.STRING()], result_file))
    orders = st_env.scan("source")
    orders.insert_into("result")
    st_env.execute("custom test source demo")
from pyflink.common import WatermarkStrategy, Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
from pyflink.datastream.connectors import (FileSource, StreamFormat, FileSink,
                                           OutputFileConfig, RollingPolicy)


def word_count(input_path, output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    if input_path is not None:
        ds = env.from_source(
            source=FileSource.for_record_stream_format(StreamFormat.text_line_format(),
                                                       input_path)
                             .process_static_file_set().build(),
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="file_source")
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        ds = env.from_collection(word_count_data)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
           .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
           .key_by(lambda i: i[0]) \
           .reduce(lambda i, j: (i[0], i[1] + j[1]))

    # define the sink
    if output_path is not None:
        ds.sink_to(
            sink=FileSink.for_row_format(
                base_path=output_path,
                encoder=Encoder.simple_string_encoder())
            .with_output_file_config(
                OutputFileConfig.builder()
                .with_part_prefix("prefix")
                .with_part_suffix(".ext")
                .build())
            .with_rolling_policy(RollingPolicy.default_rolling_policy())
            .build())
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        ds.print()

    # submit for execution
    env.execute()
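# word_count_data above is a module-level list of input lines in the original
# example; a short illustrative stand-in:
word_count_data = ["To be, or not to be, that is the question",
                   "Whether 'tis nobler in the mind to suffer"]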
def main(args):
    func = args[1]
    version = args[2]
    index_name = '_'.join(["performance_pyflink", func, version])
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 300000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    # t_env.register_table_sink(
    #     "sink",
    #     PrintTableSink(
    #         ["id"],
    #         [DataTypes.INT(False)]))

    @udf(input_types=[DataTypes.INT(False)], result_type=DataTypes.INT(False))
    def inc(x):
        return x + 1

    t_env.register_function("inc", inc)
    t_env.register_java_function("java_inc", "com.alibaba.flink.function.JavaInc")

    register_source(t_env)
    register_sink(t_env, index_name)

    source = t_env.from_path("source")
    if func == 'java':
        table = source.select("java_inc(a) as a")
    else:
        table = source.select("inc(a) as a")
    table.filter("a % 1000000 = 0") \
         .insert_into("sink")

    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink Python UDF inc() consume time: " + str(time.time() - beg_time))
def ride_duration():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)

    # use the blink table planner
    st_env = StreamTableEnvironment \
        .create(s_env, environment_settings=EnvironmentSettings
                .new_instance()
                .in_streaming_mode()
                .use_blink_planner().build())

    # register the source and sink
    register_rides_source(st_env)
    register_ride_duration_sink(st_env)

    # register the Java UDFs (isInNYC, timeDiff)
    # note: the class path for timeDiff is com.ververica.sql_training.udfs.TimeDiff
    st_env.register_java_function("isInNYC", "com.ververica.sql_training.udfs.IsInNYC")
    # register the timeDiff function
    st_env.register_java_function("timeDiff", "com.ververica.sql_training.udfs.TimeDiff")
from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (DataTypes, TableDescriptor, Schema,
                           StreamTableEnvironment)
from pyflink.table.expressions import col
from pyflink.table.udf import udf


def mixing_use_of_datastream_and_table():
    # use StreamTableEnvironment instead of TableEnvironment when mixing use of
    # table & datastream
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source
    t_env.create_temporary_table(
        'source',
        TableDescriptor.for_connector('datagen')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .option("number-of-rows", "10")
                       .build())

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('a', DataTypes.BIGINT())
                               .build())
                       .build())

    @udf(result_type=DataTypes.BIGINT())
    def length(data):
        return len(data)

    # perform table api operations
    table = t_env.from_path("source")
    table = table.select(col('id'), length(col('data')))

    # convert table to datastream and perform datastream api operations
    ds = t_env.to_data_stream(table)
    ds = ds.map(lambda i: i[0] + i[1], output_type=Types.LONG())

    # convert datastream to table and perform table api operations as you want
    table = t_env.from_data_stream(
        ds, Schema.new_builder().column("f0", DataTypes.BIGINT()).build())

    # execute
    table.execute_insert('sink') \
         .wait()
def distinct_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    orders = st_env.scan("Orders")
    result = orders.select("a, b").distinct()
    # use a custom retract sink connector
    sink = TestRetractSink(["a", "b"], [DataTypes.STRING(), DataTypes.INT()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("distinct streaming")
def from_kafka_to_kafka_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)

    # use the blink table planner
    st_env = StreamTableEnvironment \
        .create(s_env, environment_settings=EnvironmentSettings
                .new_instance()
                .in_streaming_mode()
                .use_blink_planner().build())

    # register the source and sink
    register_rides_source(st_env)
    register_rides_sink(st_env)

    # query
    st_env.from_path("source").select("*").insert_into("sink")

    # execute
    st_env.execute("from_kafka_to_kafka")
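# register_rides_source() and register_rides_sink() above come from the
# surrounding training project and are not shown here; a minimal sketch using
# Kafka DDL (topic names, fields and broker address are illustrative
# assumptions):
def register_rides_source(st_env):
    st_env.execute_sql("""
        CREATE TABLE source (
            rideId BIGINT,
            eventTime TIMESTAMP(3),
            WATERMARK FOR eventTime AS eventTime - INTERVAL '5' SECOND
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'Rides',
            'properties.bootstrap.servers' = 'kafka:9092',
            'format' = 'json'
        )
    """)


def register_rides_sink(st_env):
    st_env.execute_sql("""
        CREATE TABLE sink (
            rideId BIGINT,
            eventTime TIMESTAMP(3)
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'TempResults',
            'properties.bootstrap.servers' = 'kafka:9092',
            'format' = 'json'
        )
    """)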
def popular_destination_query():
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)
    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"))

    query = f"""SELECT destLocationId, wstart, wend, cnt
                FROM (
                    SELECT destLocationId,
                           HOP_START(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wstart,
                           HOP_END(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wend,
                           COUNT(destLocationId) AS cnt
                    FROM (SELECT pickupTime, destLocationId FROM TaxiRide)
                    GROUP BY destLocationId,
                             HOP(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE)
                )
                WHERE cnt > {args.threshold}
            """
    results = t_env.sql_query(query)

    t_env.to_append_stream(
        results,
        Types.ROW_NAMED(['destLocationId', 'wstart', 'wend', 'cnt'], [
            Types.INT(),
            Types.SQL_TIMESTAMP(),
            Types.SQL_TIMESTAMP(),
            Types.LONG()
        ])).print()

    env.execute('Popular-Destination')
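# create_table_ddl() above is defined elsewhere; a minimal sketch that builds
# the TaxiRide DDL with a pluggable watermark clause (connector options are
# illustrative assumptions):
def create_table_ddl(watermark_clause):
    return f"""
        CREATE TABLE TaxiRide (
            pickupTime TIMESTAMP(3),
            destLocationId INT,
            {watermark_clause}
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'taxi-rides',
            'properties.bootstrap.servers' = 'localhost:9092',
            'format' = 'json'
        )
    """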
def data_transfer():
    input_file = sys.argv[1]
    test_size = sys.argv[2]
    run_num = sys.argv[3]
    file_string = 'file:///home/tito/workspace/inputs/' + str(input_file)
    perf_file_path = './perf/' + str(test_size) + '/perf_' + str(run_num) + '.csv'

    start = time.time()
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    ds = env.read_text_file(file_string, 'UTF-8')
    ds.add_sink(StreamingFileSink
                .for_row_format('/home/tito/workspace/outputs', SimpleStringEncoder())
                .build())
    env.execute('data_transfer_job')
    end = time.time()

    with open(perf_file_path, 'w+') as perf_file:
        perf_file.write(f'{start},{end}\n')
def log_processing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string("pipeline.jars",
                                                      "file://" + FAT_JAR_PATH)

    source_ddl = """
        CREATE TABLE source_table(
            a VARCHAR,
            b INT
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'source_topic',
            'properties.bootstrap.servers' = 'localhost:9092',
            'properties.group.id' = 'test_group',
            'scan.startup.mode' = 'earliest-offset',
            'format' = 'json'
        )
    """

    sink_ddl = """
        CREATE TABLE sink_table(
            a VARCHAR
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'sink_topic',
            'properties.bootstrap.servers' = 'localhost:9092',
            'format' = 'json'
        )
    """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    t_env.from_path("source_table").select("a").execute_insert("sink_table").wait()
def in_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements(
        [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"),
         (2, "lb", "lbb"), (4, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements(
        [(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")],
        ["a", "b", "c"]).select("a")

    result = left.where("a.in(%s)" % right).select("b, c")
    # another way
    # st_env.register_table("RightTable", right)
    # result = left.where("a.in(RightTable)")

    # use a custom retract sink connector
    sink = TestRetractSink(["a", "b"], [DataTypes.STRING(), DataTypes.STRING()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("in streaming")
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    source_type_info = Types.ROW([Types.STRING(), Types.INT()])
    json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(source_type_info).build()
    source_topic = 'test-python-data-stream-source'
    consumer_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-source'
    }
    kafka_consumer_1 = FlinkKafkaConsumer(source_topic, json_row_deserialization_schema,
                                          consumer_props)
    kafka_consumer_1.set_start_from_earliest()
    source_stream_1 = env.add_source(kafka_consumer_1).name('kafka source 1')
    mapped_type_info = Types.ROW([Types.STRING(), Types.INT(), Types.INT()])

    keyed_stream = source_stream_1.map(add_one, output_type=mapped_type_info) \
        .key_by(lambda x: x[2])

    flat_mapped_stream = keyed_stream.flat_map(m_flat_map, result_type=mapped_type_info)
    flat_mapped_stream.name("flat-map").set_parallelism(3)

    sink_topic = 'test-python-data-stream-sink'
    producer_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-1'
    }
    json_row_serialization_schema = JsonRowSerializationSchema.builder() \
        .with_type_info(mapped_type_info).build()
    kafka_producer = FlinkKafkaProducer(
        topic=sink_topic,
        producer_config=producer_props,
        serialization_schema=json_row_serialization_schema)
    flat_mapped_stream.add_sink(kafka_producer)
    env.execute_async("test data stream to kafka")
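# add_one and m_flat_map above are defined elsewhere in the test module; a
# minimal sketch consistent with the declared type infos (the exact logic is
# an assumption):
from pyflink.common import Row


def add_one(value):
    # extend the (string, int) record with int + 1 as the key field
    return Row(value[0], value[1], value[1] + 1)


def m_flat_map(value):
    # pass the record through unchanged; the real logic is not shown here
    yield value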
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to one so that all data, including fired timers and
    # normal records, is processed by the same worker and the collected result
    # is ordered, which makes assertions easier.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    type_info = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(), Types.INT()])
    json_row_schema = JsonRowDeserializationSchema.builder().type_info(type_info).build()
    kafka_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-source'
    }
    kafka_consumer = FlinkKafkaConsumer("timer-stream-source", json_row_schema, kafka_props)
    kafka_producer = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(), kafka_props)

    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5)) \
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    kafka_consumer.set_start_from_earliest()
    ds = env.add_source(kafka_consumer).assign_timestamps_and_watermarks(watermark_strategy)
    ds.key_by(MyKeySelector(), key_type=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
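# KafkaRowTimestampAssigner above is defined elsewhere; a minimal sketch that
# reads the event time from the createTime field (the field position and its
# unit are assumptions):
from pyflink.common.watermark_strategy import TimestampAssigner


class KafkaRowTimestampAssigner(TimestampAssigner):
    def extract_timestamp(self, value, record_timestamp):
        # createTime is the first field of the row, already in epoch millis
        return int(value[0])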
def custom_test_sink_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements(
        [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"),
         (2, "4b", "4bb"), (5, "5a", "5aa")],
        ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements(
        [(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")],
        ["d", "e", "f"]).select("d, e, f")

    result = left.left_outer_join(right, "a = d").select("a, b, e")
    # use a custom retract sink connector
    custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False)
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("a", DataTypes.BIGINT())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ).register_table_sink("sink")
    result.insert_into("sink")
    st_env.execute("custom test sink demo")
def run_consumer(output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    # write all the data to one file
    env.set_parallelism(1)

    # get the credit card data
    dataset = datasets.CreditCard()

    # create a small collection of items
    i = 0
    num_of_items = 2000
    items = []
    for x, y in dataset:
        if i == num_of_items:
            break
        i += 1
        items.append((json.dumps(x), y))

    credit_stream = env.from_collection(
        collection=items,
        type_info=Types.ROW([Types.STRING(), Types.STRING()]))

    # detect fraud in transactions
    fraud_data = credit_stream.map(
        lambda data: json.dumps(
            requests.post('http://localhost:9000/predict',
                          json={'x': data[0], 'y': data[1]}).json()),
        output_type=Types.STRING())

    # save the results to a file
    fraud_data.sink_to(sink=FileSink.for_row_format(
        base_path=output_path,
        encoder=Encoder.simple_string_encoder()).build())

    # submit for execution
    env.execute()