def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to one so that all data, including fired timers and normal records,
    # is processed by the same worker and the collected results stay in order, which makes
    # the assertions easier.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    type_info = Types.ROW_NAMED(['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
                                [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(), Types.INT()])
    json_row_schema = JsonRowDeserializationSchema.builder().type_info(type_info).build()
    kafka_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-source'}
    kafka_consumer = FlinkKafkaConsumer("timer-stream-source", json_row_schema, kafka_props)
    kafka_producer = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(), kafka_props)

    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5)) \
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    kafka_consumer.set_start_from_earliest()
    ds = env.add_source(kafka_consumer).assign_timestamps_and_watermarks(watermark_strategy)
    ds.key_by(MyKeySelector(), key_type_info=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
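# The example above references KafkaRowTimestampAssigner, MyKeySelector and MyProcessFunction
# without defining them. Below is only a minimal sketch of what such helpers could look like,
# written against the generator-style functions of recent PyFlink releases; the field choices
# (createTime as event time, orderId as key, a one-second timer) are assumptions, not taken
# from the original code.
from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream.functions import KeySelector, KeyedProcessFunction


class KafkaRowTimestampAssigner(TimestampAssigner):

    def extract_timestamp(self, value, record_timestamp):
        # assume the first field ('createTime') carries the event time in milliseconds
        return int(value[0])


class MyKeySelector(KeySelector):

    def get_key(self, value):
        # assume the stream is keyed by the LONG field 'orderId'
        return value[1]


class MyProcessFunction(KeyedProcessFunction):

    def process_element(self, value, ctx):
        # emit a description of the record and register an event-time timer
        yield "orderId: {} payAmount: {}".format(value[1], value[2])
        ctx.timer_service().register_event_time_timer(ctx.timestamp() + 1000)

    def on_timer(self, timestamp, ctx):
        yield "timer fired at {}".format(timestamp)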
def read_from_kafka(env):
    deserialization_schema = JsonRowDeserializationSchema.Builder() \
        .type_info(Types.ROW([Types.INT(), Types.STRING()])) \
        .build()

    kafka_consumer = FlinkKafkaConsumer(
        topics='test_json_topic',
        deserialization_schema=deserialization_schema,
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group_1'})
    kafka_consumer.set_start_from_earliest()

    env.add_source(kafka_consumer).print()
    env.execute()
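# read_from_kafka expects the caller to prepare the environment and to make the Kafka connector
# available to the job. A possible driver is sketched below; the connector jar name and its
# location are assumptions and must match the Flink/Kafka versions actually in use.
if __name__ == '__main__':
    from pyflink.datastream import StreamExecutionEnvironment

    env = StreamExecutionEnvironment.get_execution_environment()
    # hypothetical path: point this at the flink-sql-connector-kafka jar you downloaded
    env.add_jars("file:///path/to/flink-sql-connector-kafka_2.11-1.12.0.jar")
    read_from_kafka(env)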
def test_add_classpaths(self):
    # find kafka connector jars
    flink_source_root = _find_flink_source_root()
    jars_abs_path = flink_source_root + '/flink-connectors/flink-sql-connector-kafka'
    specific_jars = glob.glob(jars_abs_path + '/target/flink*.jar')
    specific_jars = ['file://' + specific_jar for specific_jar in specific_jars]

    self.env.add_classpaths(*specific_jars)
    source_topic = 'test_source_topic'
    props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    type_info = Types.ROW([Types.INT(), Types.STRING()])

    # Test for kafka consumer
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    # It will raise a ClassNotFoundException if the kafka connector is not added into the
    # pipeline classpaths.
    kafka_consumer = FlinkKafkaConsumer(source_topic, deserialization_schema, props)
    self.env.add_source(kafka_consumer).print()
    self.env.get_execution_plan()
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()

    source_type_info = Types.ROW([Types.STRING(), Types.INT()])
    json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(source_type_info).build()
    source_topic = 'test-python-data-stream-source'
    consumer_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-source'}
    kafka_consumer_1 = FlinkKafkaConsumer(source_topic, json_row_deserialization_schema,
                                          consumer_props)
    kafka_consumer_1.set_start_from_earliest()
    source_stream_1 = env.add_source(kafka_consumer_1).name('kafka source 1')

    mapped_type_info = Types.ROW([Types.STRING(), Types.INT(), Types.INT()])
    keyed_stream = source_stream_1.map(add_one, output_type=mapped_type_info) \
        .key_by(lambda x: x[2])

    flat_mapped_stream = keyed_stream.flat_map(m_flat_map, result_type=mapped_type_info)
    flat_mapped_stream.name("flat-map").set_parallelism(3)

    sink_topic = 'test-python-data-stream-sink'
    producer_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-1'}
    json_row_serialization_schema = JsonRowSerializationSchema.builder() \
        .with_type_info(mapped_type_info).build()
    kafka_producer = FlinkKafkaProducer(
        topic=sink_topic,
        producer_config=producer_props,
        serialization_schema=json_row_serialization_schema)
    flat_mapped_stream.add_sink(kafka_producer)
    env.execute_async("test data stream to kafka")
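# add_one and m_flat_map are used above but not defined. A minimal sketch that is consistent
# with the declared mapped_type_info (STRING, INT, INT) could look like this; the exact logic
# of the original helpers is an assumption.
from pyflink.common import Row


def add_one(value):
    # map ROW(STRING, INT) -> ROW(STRING, INT, INT), appending value[1] + 1 as a new field
    return Row(value[0], value[1], value[1] + 1)


def m_flat_map(value):
    # pass the row through unchanged; a real flat_map may emit zero or more records per input
    yield value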
def read_from_kafka(env):
    deserialization_schema = AvroRowDeserializationSchema(
        avro_schema_string="""
            {
                "type": "record",
                "name": "TestRecord",
                "fields": [
                    {"name": "id", "type": "int"},
                    {"name": "name", "type": "string"}
                ]
            }""")

    kafka_consumer = FlinkKafkaConsumer(
        topics='test_avro_topic',
        deserialization_schema=deserialization_schema,
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group_1'})
    kafka_consumer.set_start_from_earliest()

    env.add_source(kafka_consumer).print()
    env.execute()
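# For orientation only, a sketch of the write side: it serializes rows with the same Avro schema
# and produces them to the topic read above. Topic and broker are taken from the example;
# everything else, including the import locations (check your PyFlink release, newer versions
# keep the Avro schemas in pyflink.datastream.formats.avro), is an assumption.
def write_to_kafka(env):
    from pyflink.common.serialization import AvroRowSerializationSchema
    from pyflink.common.typeinfo import Types
    from pyflink.datastream.connectors import FlinkKafkaProducer

    serialization_schema = AvroRowSerializationSchema(
        avro_schema_string="""
            {
                "type": "record",
                "name": "TestRecord",
                "fields": [
                    {"name": "id", "type": "int"},
                    {"name": "name", "type": "string"}
                ]
            }""")
    # a small in-memory collection stands in for real data
    ds = env.from_collection(
        [(1, 'hello'), (2, 'world')],
        type_info=Types.ROW([Types.INT(), Types.STRING()]))
    ds.add_sink(FlinkKafkaProducer(
        topic='test_avro_topic',
        serialization_schema=serialization_schema,
        producer_config={'bootstrap.servers': 'localhost:9092'}))
    env.execute()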
def run():
    # get the execution environment
    env = StreamExecutionEnvironment.get_execution_environment()
    # apply the project-specific environment settings
    env_setting(env)
    # set the parallelism
    env.set_parallelism(1)

    # add the connector jar files; on Windows change these to the paths where your jars live
    kafka_jar = f"file://{os.getcwd()}/jars/flink-connector-kafka_2.11-1.12.0.jar"
    kafka_client = f"file://{os.getcwd()}/jars/kafka-clients-2.4.1.jar"
    env.add_jars(kafka_jar, kafka_client)

    # add python dependency files
    env.add_python_file(f"{os.getcwd()}/config_file.py")
    env.add_python_file(f"{os.getcwd()}/env_setting.py")

    # use a packaged python environment (a custom venv packed into a zip)
    env.add_python_archive(f"{os.getcwd()}/venv.zip")
    env.set_python_executable("venv.zip/venv/bin/python")
    # or use the local python environment
    # env.set_python_executable(PYTHON_EXECUTABLE)

    env.disable_operator_chaining()

    kafka_product_properties = get_kafka_Producer_properties(TEST_KAFKA_SERVERS)
    properties = get_kafka_customer_properties(TEST_KAFKA_SERVERS, TEST_GROUP_ID)

    data_stream = env.add_source(
        FlinkKafkaConsumer(topics=TEST_KAFKA_TOPIC,
                           properties=properties,
                           deserialization_schema=SimpleStringSchema())
        .set_commit_offsets_on_checkpoints(True)) \
        .name(f"consume data from the {TEST_KAFKA_TOPIC} topic")

    data_stream.map(lambda value: json.loads(value)) \
        .name("parse to json") \
        .map(lambda value: json.dumps(value), BasicTypeInfo.STRING_TYPE_INFO()) \
        .name("serialize back to str") \
        .add_sink(FlinkKafkaProducer(topic=TEST_SINK_TOPIC,
                                     producer_config=kafka_product_properties,
                                     serialization_schema=SimpleStringSchema())) \
        .name("write to kafka")

    env.execute("test reading from and writing to kafka with pyflink")
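# env_setting, get_kafka_Producer_properties, get_kafka_customer_properties and the TEST_*
# constants come from the project-local modules config_file.py / env_setting.py shipped via
# add_python_file above; they are not shown in the snippet. Hypothetical minimal versions,
# with assumed broker, topics and group id:
TEST_KAFKA_SERVERS = "localhost:9092"      # assumed broker address
TEST_KAFKA_TOPIC = "test_source_topic"     # assumed source topic
TEST_SINK_TOPIC = "test_sink_topic"        # assumed sink topic
TEST_GROUP_ID = "test_group"               # assumed consumer group


def env_setting(env):
    # placeholder for project-specific configuration, e.g. checkpointing
    env.enable_checkpointing(60000)


def get_kafka_customer_properties(servers, group_id):
    return {"bootstrap.servers": servers, "group.id": group_id}


def get_kafka_Producer_properties(servers):
    return {"bootstrap.servers": servers}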
def tutorial():
    env = StreamExecutionEnvironment.get_execution_environment()
    jar_files = (
        'flink-connector-kafka_2.12-1.12.2.jar',
        'kafka-clients-2.4.1.jar',
    )
    jar_paths = tuple('file://' + os.path.abspath(os.path.join(cur_path, jar_file))
                      for jar_file in jar_files)
    env.add_jars(*jar_paths)
    env.add_classpaths(*jar_paths)
    env.set_parallelism(1)

    ds = env.add_source(
        FlinkKafkaConsumer(TOPIC, SimpleStringSchema(), KAFKA_PROPERTIES))
    ds.print()
    env.execute("tutorial_job")
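# cur_path, TOPIC and KAFKA_PROPERTIES are module-level names the tutorial relies on but does
# not show. Plausible definitions are sketched here; the topic name and broker address are
# assumptions.
import os

cur_path = os.path.dirname(os.path.abspath(__file__))
TOPIC = 'tutorial_topic'
KAFKA_PROPERTIES = {'bootstrap.servers': 'localhost:9092', 'group.id': 'tutorial_group'}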
from pyflink.common.serialization import Encoder, JsonRowDeserializationSchema
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, StreamingFileSink

s_env = StreamExecutionEnvironment.get_execution_environment()
s_env.set_parallelism(1)

ti = Types.ROW_NAMED(
    ["app", "busi", "date", "ip"],
    [Types.STRING(), Types.STRING(), Types.BIG_INT(), Types.STRING()])
builder = JsonRowDeserializationSchema.builder()
builder.type_info(ti)
jrds = builder.ignore_parse_errors().build()

fkc = FlinkKafkaConsumer(topics="ULS-BUSI-LOG-dev",
                         deserialization_schema=jrds,
                         properties={
                             "bootstrap.servers": "10.100.1.16:9192",
                             "group.id": "123",
                             "auto.offset.reset": "earliest"
                         })
fkc.set_start_from_earliest()

src = s_env.add_source(fkc).map(lambda x: x.get("values"))
# PyFlink exposes the Java SimpleStringEncoder via Encoder.simple_string_encoder()
src.add_sink(
    StreamingFileSink.for_row_format('C:\\tmp\\pyoutput',
                                     Encoder.simple_string_encoder()).build())

s_env.execute("123")
def demo01():
    # Create an execution environment that represents the context in which the program is
    # currently executed. If the program is invoked standalone, the method returns a local
    # execution environment; if it is submitted to a cluster, it returns a cluster environment.
    env = StreamExecutionEnvironment.get_execution_environment()

    # Add a list of URLs that are added to the classpath of every user-code class loader of the
    # program. The paths must specify a protocol (e.g. file://) and be accessible on all nodes.
    env.add_classpaths("file://lib")
    # Add a list of jar files that will be uploaded to the cluster and referenced by the job,
    # equivalent to .set_string("pipeline.jars", 'file://' + dir_kafka_sql_connect)
    env.add_jars("file://jars")

    # Add a python archive file. The file will be extracted into the working directory of the
    # python UDF workers. Only zip-like formats are supported, e.g. zip, jar, whl, egg.
    # The archive is typically created beforehand with: zip -r py_env.zip py_env
    env.add_python_archive("py_env.zip")
    # If the python UDFs depend on a specific python version that does not exist on the cluster,
    # this method can be used to upload a virtual environment. Note that the path of the python
    # interpreter contained in the uploaded environment should be specified via this method.
    env.set_python_executable("py_env.zip/py_env/bin/python")
    # in conf/flink-conf.yaml you can also set: python.client.executable: /usr/bin/python3

    # or, with an explicit target directory:
    env.add_python_archive("py_env.zip", "myenv")
    env.set_python_executable("myenv/py_env/bin/python")
    # the files contained in the archive can be accessed from inside a UDF
    """
    def my_udf():
        with open("myenv/py_env/data/data.txt") as f:
            ...
    """
    # equivalent to: pip download -d cached_dir -r requirements.txt --no-binary :all:
    env.set_python_requirements("requirements.txt", "cached_dir")
    # Add a python dependency, which can be a python file, a python package or a local directory.
    # It will be added to the PYTHONPATH of the python UDF workers; make sure it can be imported.
    env.add_python_file("")

    # Add a source
    # 1. add_source
    ds = env.add_source(
        FlinkKafkaConsumer(
            "source_topic",
            JsonRowDeserializationSchema.builder().type_info(
                type_info=Types.ROW([Types.INT(), Types.STRING()])).build(),
            {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}))
    # 2. from_collection
    ds = env.from_collection([1, 2, 3], Types.INT())
    # 3. from a file
    ds = env.read_text_file("hdfs://host:port/file/path")

    # disable operator chaining
    env.disable_operator_chaining()
    """
    Flink can run stateful stream computations very efficiently: each operator's state is kept
    in the built-in Keyed State and Operator State. By default that state lives in the JVM heap,
    so if any part of the system fails or crashes, all state is lost and cannot be recovered,
    which corrupts the whole computation. Checkpoints provide the fault tolerance for this case:
    a checkpoint periodically persists the state of every operator. When the job recovers from a
    failure, each operator's state is restored from the checkpoint and consumption and
    computation resume from where they left off, so the job stays efficient while guaranteeing
    that no data is lost and that it is effectively processed only once.

    AT_LEAST_ONCE: if the failure happens in transit and the server never receives the data, the
    sender retransmits after a timeout. But the failure may also occur while the acknowledgement
    is being returned, i.e. the server has already received the data; the retransmission then
    delivers duplicates. That is at-least-once.
    EXACTLY_ONCE: strictly once.
    The three delivery guarantees are at-most-once, at-least-once and exactly-once.

    Two prerequisites for checkpointing:
    1. A source that can replay data within a certain time range, e.g. Kafka. Fault tolerance
       works by automatically restarting the job from the latest successful checkpoint and
       re-consuming the data processed before the failure. If the source cannot replay, records
       that were not yet written to the sink are lost and can never be re-consumed after
       recovery.
    2. A durable store to persist the state, e.g. HDFS or a local filesystem, so that the
       checkpoint data can be restored after a failure.

    https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/stream/state/checkpointing.html
    https://ci.apache.org/projects/flink/flink-docs-release-1.12/api/python/pyflink.datastream.html#pyflink.datastream.CheckpointConfig
    """
    # checkpoint every 300 s
    env.enable_checkpointing(300000, CheckpointingMode.AT_LEAST_ONCE)
    # other backends: MemoryStateBackend, FsStateBackend, a custom state backend
    env.set_state_backend(RocksDBStateBackend("file:///var/checkpoints/"))
    # set mode to exactly-once (this is the default)
    env.get_checkpoint_config().set_checkpointing_mode(CheckpointingMode.EXACTLY_ONCE)
    # make sure at least 500 ms of progress happen between checkpoints
    # (default 0: the next checkpoint may start immediately)
    env.get_checkpoint_config().set_min_pause_between_checkpoints(500)
    # checkpoints have to complete within one minute, or are discarded
    env.get_checkpoint_config().set_checkpoint_timeout(60000)
    # allow only one checkpoint to be in progress at the same time
    env.get_checkpoint_config().set_max_concurrent_checkpoints(1)
    # enable externalized checkpoints which are retained after job cancellation
    env.get_checkpoint_config().enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    # allow job recovery to fall back to a checkpoint even when there is a more recent savepoint
    env.get_checkpoint_config().set_prefer_checkpoint_for_recovery(True)
    # enable the experimental unaligned checkpoints to improve performance;
    # only available with CheckpointingMode.EXACTLY_ONCE
    env.get_checkpoint_config().enable_unaligned_checkpoints()
    # env.get_checkpoint_config().disable_unaligned_checkpoints() is equivalent to
    # env.get_checkpoint_config().enable_unaligned_checkpoints(False)
    env.get_checkpoint_interval()  # same as env.get_checkpoint_config().get_checkpoint_interval()

    # https://ci.apache.org/projects/flink/flink-docs-release-1.12/api/python/pyflink.common.html#pyflink.common.ExecutionConfig
    # bin/flink run -Dexecution.runtime-mode=BATCH examples/streaming/WordCount.jar
    env.get_config().set_execution_mode(ExecutionMode.BATCH)
    env.get_config().disable_auto_generated_uids()  # enable_auto_generated_uids
    # set the uid yourself: ds.uid("xx")
    # Set the time characteristic for all streams created from this environment, e.g. processing
    # time, event time or ingestion time. When the characteristic is set to EventTime or
    # IngestionTime, the watermark interval defaults to 200 ms.
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env.get_config().set_auto_watermark_interval(200)  # emit a watermark every 200 ms
    env.get_config().set_global_job_parameters({"environment.checkpoint_interval": "1000"})
    env.get_config().set_restart_strategy(RestartStrategies.fixed_delay_restart(10, 1000))
    # execute
    env.execute("job name")
    # execute asynchronously
    jobClient = env.execute_async("job name")
    jobClient.get_job_execution_result().result()

    """
    Set the maximum time frequency (in milliseconds) at which the output buffers are flushed.
    By default the output buffers flush frequently to provide low latency and a smooth developer
    experience. The parameter produces three logical modes:
    - a positive integer triggers a periodic flush at that interval
    - 0 triggers a flush after every record, minimising latency (better not to use 0 itself but
      a value close to it, e.g. 5 or 10)
    - -1 triggers a flush only when the output buffer is full, maximising throughput
    """
    # maximum time frequency (ms) for flushing the output buffers
    env.get_buffer_timeout()
    env.set_buffer_timeout(10)
    # get the execution plan as json and paste it into https://flink.apache.org/visualizer/
    env.get_execution_plan()