def demo01():
    """Demonstrate the built-in watermark generation strategies.

    Builds a tiny bounded stream and shows, in order:
      1. bounded out-of-orderness watermarks (fixed maximum lateness),
      2. monotonously increasing watermarks,
      3. attaching a custom timestamp assigner,
      4. idleness handling for sparse streams.

    Only the last strategy built is actually applied to the stream.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    ds = env.from_collection(
        collection=[(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))

    # Attach watermarks to the events.
    # 1. Built-in watermark generation strategies.
    # 1.1 Delayed watermarks: watermarks trail the highest seen timestamp by 10s.
    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(10))

    # 1.2 Monotonously increasing watermarks: equivalent to the delayed strategy
    #     above with the delay removed -- the event timestamp itself serves as
    #     the watermark.
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps()

    # 2. Extracting the event time with a custom timestamp assigner.
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps(
    ).with_timestamp_assigner(MyTimestampAssigner())

    """
    In some cases a source produces very little data, so for a while no records
    arrive and therefore no watermarks are generated, which breaks downstream
    operations that depend on watermarks. For example, when an operator has
    multiple upstream inputs, its watermark is the minimum of the upstream
    watermarks; if one upstream emits no watermarks for lack of data, event-time
    skew appears and the downstream computation is never triggered.
    Flink's WatermarkStrategy.with_idleness() therefore lets the user mark a
    stream as idle when no records arrive within the configured timeout, which
    means downstream operators no longer wait for its watermarks. As soon as a
    watermark is generated and emitted downstream again, the stream becomes
    active once more.
    """
    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(10)).with_idleness(Duration.of_seconds(30))

    # FIX: assign_timestamps_and_watermarks() returns a NEW stream; the original
    # code discarded the result, so the printed stream carried no watermarks.
    ds = ds.assign_timestamps_and_watermarks(watermark_strategy)
    ds.print()
    # FIX: the job graph is built lazily -- without execute() nothing runs and
    # print() emits no output.
    env.execute()
def event_timer_timer_demo():
    """Keyed event-time processing demo.

    Feeds (timestamp, name, amount) tuples through a bounded-out-of-orderness
    watermark strategy, keys the stream by name, applies the Sum() process
    function, and prints the result.
    """
    env = StreamExecutionEnvironment.get_execution_environment()

    records = [
        (1000, 'Alice', 110.1),
        (4000, 'Bob', 30.2),
        (3000, 'Alice', 20.0),
        (2000, 'Bob', 53.1),
        (5000, 'Alice', 13.1),
        (3000, 'Bob', 3.1),
        (7000, 'Bob', 16.1),
        (10000, 'Alice', 20.1),
    ]
    source = env.from_collection(
        collection=records,
        type_info=Types.TUPLE([Types.LONG(), Types.STRING(), Types.FLOAT()]))

    # Watermarks may trail the highest timestamp by up to 2 seconds; the
    # custom assigner extracts the event time from each record.
    strategy = (WatermarkStrategy
                .for_bounded_out_of_orderness(Duration.of_seconds(2))
                .with_timestamp_assigner(MyTimestampAssigner()))
    timed = source.assign_timestamps_and_watermarks(strategy)

    # apply the process function onto a keyed stream
    keyed = timed.key_by(lambda record: record[1])
    keyed.process(Sum()).print()

    # submit for execution
    env.execute()
def python_data_stream_example():
    """Kafka-to-Kafka pipeline exercising event-time timers.

    Reads JSON rows from the 'timer-stream-source' topic, keys them with
    MyKeySelector, runs MyProcessFunction (which uses timers), and writes the
    string results to the 'timer-stream-sink' topic asynchronously.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to be one to make sure that all data including fired
    # timer and normal data are processed by the same worker and the collected
    # result would be in order which is good for assertion.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    row_type = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(), Types.INT()])
    deserializer = JsonRowDeserializationSchema.builder() \
        .type_info(row_type).build()

    kafka_props = {'bootstrap.servers': 'localhost:9092',
                   'group.id': 'pyflink-e2e-source'}
    consumer = FlinkKafkaConsumer(
        "timer-stream-source", deserializer, kafka_props)
    consumer.set_start_from_earliest()
    producer = FlinkKafkaProducer(
        "timer-stream-sink", SimpleStringSchema(), kafka_props)

    # Watermarks trail the max timestamp by 5s; event time comes from the row.
    strategy = WatermarkStrategy \
        .for_bounded_out_of_orderness(Duration.of_seconds(5)) \
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    stream = env.add_source(consumer).assign_timestamps_and_watermarks(strategy)
    stream.key_by(MyKeySelector(), key_type_info=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(producer)
    env.execute_async("test data stream timer")
def test_with_watermark_alignment(self):
    """with_watermark_alignment wraps the strategy and carries its parameters."""
    jvm = get_gateway().jvm

    strategy = WatermarkStrategy.no_watermarks().with_watermark_alignment(
        "alignment-group-1", Duration.of_seconds(20), Duration.of_seconds(10))
    j_watermark_strategy = strategy._j_watermark_strategy

    # The Java-side object must be the alignment wrapper class.
    expected_class = (jvm.org.apache.flink.api.common
                      .eventtime.WatermarksWithWatermarkAlignment)
    self.assertTrue(is_instance_of(j_watermark_strategy, expected_class))

    # Group name, max drift and update interval must round-trip (durations in ms).
    params = j_watermark_strategy.getAlignmentParameters()
    self.assertEqual(params.getWatermarkGroup(), "alignment-group-1")
    self.assertEqual(params.getMaxAllowedWatermarkDrift(), 20000)
    self.assertEqual(params.getUpdateInterval(), 10000)
def test_with_idleness(self):
    """with_idleness wraps the strategy and stores the timeout."""
    jvm = get_gateway().jvm

    idle_strategy = WatermarkStrategy.no_watermarks().with_idleness(
        Duration.of_seconds(5))
    j_watermark_strategy = idle_strategy._j_watermark_strategy

    # The Java-side object must be the idleness wrapper class.
    wrapper_class = (jvm.org.apache.flink.api.common
                     .eventtime.WatermarkStrategyWithIdleness)
    self.assertTrue(is_instance_of(j_watermark_strategy, wrapper_class))

    # The configured timeout must round-trip (5 seconds == 5000 ms).
    timeout = get_field_value(j_watermark_strategy, "idlenessTimeout")
    self.assertEqual(timeout.toMillis(), 5000)
def test_for_bounded_out_of_orderness(self):
    """for_bounded_out_of_orderness builds the expected Java generator."""
    jvm = get_gateway().jvm

    j_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(3))._j_watermark_strategy
    j_generator = j_strategy.createWatermarkGenerator(None)

    # The generator must be the bounded-out-of-orderness implementation.
    generator_class = (jvm.org.apache.flink.api.common
                       .eventtime.BoundedOutOfOrdernessWatermarks)
    self.assertTrue(is_instance_of(j_generator, generator_class))

    # The configured lateness must round-trip (3 seconds == 3000 ms).
    self.assertEqual(
        get_field_value(j_generator, "outOfOrdernessMillis"), 3000)
def test_pulsar_sink(self):
    """Build a fully-configured PulsarSink and verify, via reflection on the
    underlying Java objects, that every builder option was applied."""
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                   ('deeefg', 4)],
                                  type_info=Types.ROW(
                                      [Types.STRING(), Types.INT()]))
    TEST_OPTION_NAME = 'pulsar.producer.chunkingEnabled'
    # Exercise every builder knob: URLs, producer name, topic, serialization,
    # delivery guarantee, routing mode, message delay, a raw config option and
    # a properties map.
    pulsar_sink = PulsarSink.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_producer_name('fo') \
        .set_topics('ada') \
        .set_serialization_schema(
            PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
        .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
        .set_topic_routing_mode(TopicRoutingMode.ROUND_ROBIN) \
        .delay_sending_message(MessageDelayer.fixed(Duration.of_seconds(12))) \
        .set_config(TEST_OPTION_NAME, True) \
        .set_properties({'pulsar.producer.batchingMaxMessages': '100'}) \
        .build()

    ds.sink_to(pulsar_sink).name('pulsar sink')

    # The execution plan JSON should contain the sink writer node.
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('pulsar sink: Writer', plan['nodes'][1]['type'])

    # Reach into the Java sink object and check the stored configuration.
    configuration = get_field_value(pulsar_sink.get_java_function(),
                                    "sinkConfiguration")
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.client.serviceUrl').string_type().
            no_default_value()._j_config_option), 'pulsar://localhost:6650')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.admin.adminUrl').string_type().
            no_default_value()._j_config_option), 'http://localhost:8080')
    # NOTE(review): the stored producer name appears to be templated with a
    # '%s' suffix on the Java side — hence 'fo - %s' rather than 'fo'.
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.producer.producerName').string_type(
            ).no_default_value()._j_config_option), 'fo - %s')

    # The Python wrapper nests the Flink schema inside the Pulsar schema.
    j_pulsar_serialization_schema = get_field_value(
        pulsar_sink.get_java_function(), 'serializationSchema')
    j_serialization_schema = get_field_value(j_pulsar_serialization_schema,
                                             'serializationSchema')
    self.assertTrue(
        is_instance_of(
            j_serialization_schema,
            'org.apache.flink.api.common.serialization.SimpleStringSchema')
    )

    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.sink.deliveryGuarantee').string_type(
            ).no_default_value()._j_config_option), 'at-least-once')

    # ROUND_ROBIN routing mode maps to the RoundRobinTopicRouter class.
    j_topic_router = get_field_value(pulsar_sink.get_java_function(),
                                     "topicRouter")
    self.assertTrue(
        is_instance_of(
            j_topic_router,
            'org.apache.flink.connector.pulsar.sink.writer.router.RoundRobinTopicRouter'
        ))

    # The fixed message delayer stores the delay in milliseconds.
    j_message_delayer = get_field_value(pulsar_sink.get_java_function(),
                                        'messageDelayer')
    delay_duration = get_field_value(j_message_delayer, 'delayDuration')
    self.assertEqual(delay_duration, 12000)

    # Options set via set_config() and set_properties() must also round-trip.
    test_option = ConfigOptions.key(
        TEST_OPTION_NAME).boolean_type().no_default_value()
    self.assertEqual(
        configuration.getBoolean(test_option._j_config_option), True)
    self.assertEqual(
        configuration.getLong(
            ConfigOptions.key('pulsar.producer.batchingMaxMessages').
            long_type().no_default_value()._j_config_option), 100)