예제 #1
0
    def test_side_output_late_data(self):
        """Late records of an event-time window must be routed to the side output.

        The input carries the timestamps 0, 8, 4, 6 in that order and is grouped into
        5 ms tumbling event-time windows with an allowed lateness of 0.  The test
        expects one result per window on the main output and only the out-of-order
        record ``('a', 4)`` on the side output tagged ``late-data``.
        """
        def string_int_row():
            # Build a new ROW(STRING, INT) type descriptor on every call.
            return Types.ROW([Types.STRING(), Types.INT()])

        self.env.set_parallelism(1)

        # Set the Python bundle size to a single element through the Java-side
        # configuration of the execution environment.
        j_env_conf = get_j_env_configuration(self.env._j_stream_execution_environment)
        env_conf = Configuration(j_configuration=j_env_conf)
        env_conf.set_integer('python.fn-execution.bundle.size', 1)

        # Wrap a Java WatermarkStrategy that is driven by PerElementWatermarkGenerator
        # (presumably one watermark per record -- confirm against the Java class).
        flink_pkg = get_gateway().jvm.org.apache.flink
        j_strategy_cls = flink_pkg.api.common.eventtime.WatermarkStrategy
        j_generator_cls = \
            flink_pkg.streaming.api.functions.python.eventtime.PerElementWatermarkGenerator
        j_strategy = j_strategy_cls.forGenerator(j_generator_cls.getSupplier())
        per_element_strategy = WatermarkStrategy(j_strategy)
        per_element_strategy = per_element_strategy.with_timestamp_assigner(
            SecondColumnTimestampAssigner())

        late_tag = OutputTag('late-data', type_info=string_int_row())
        records = [('a', 0), ('a', 8), ('a', 4), ('a', 6)]
        source = self.env.from_collection(records, type_info=string_int_row())

        # Key by the first field, window by event time and divert late data.
        timestamped = source.assign_timestamps_and_watermarks(per_element_strategy)
        keyed = timestamped.key_by(lambda record: record[0])
        windowed = keyed.window(TumblingEventTimeWindows.of(Time.milliseconds(5)))
        windowed = windowed.allowed_lateness(0).side_output_late_data(late_tag)
        result_type = Types.TUPLE(
            [Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()])
        window_results = windowed.process(CountWindowProcessFunction(), result_type)

        # Collect the main stream and the late-data side stream separately.
        main_sink = DataStreamTestSinkFunction()
        window_results.add_sink(main_sink)
        side_sink = DataStreamTestSinkFunction()
        window_results.get_side_output(late_tag).add_sink(side_sink)

        self.env.execute('test_side_output_late_data')

        self.assert_equals_sorted(['(a,0,5,1)', '(a,5,10,2)'], main_sink.get_results())
        self.assert_equals_sorted(['+I[a, 4]'], side_sink.get_results())
예제 #2
0
    def test_event_time_tumbling_window(self):
        """Run a keyed 5 ms tumbling event-time window over eight ``('hi', n)`` records.

        The second tuple field takes the values 1, 2, 3, 4, 5, 8, 9 and 15.  The sink is
        expected to receive ``(hi,4)``, ``(hi,3)`` and ``(hi,1)``, which matches the
        number of those values falling into [0, 5), [5, 10) and [15, 20).
        """
        records = [('hi', value) for value in (1, 2, 3, 4, 5, 8, 9, 15)]
        record_type = Types.TUPLE([Types.STRING(), Types.INT()])
        data_stream = self.env.from_collection(
            records, type_info=record_type)  # type: DataStream

        # Monotonous watermarks; SecondColumnTimestampAssigner is defined outside this
        # block (presumably it reads the timestamp from the second field -- confirm).
        strategy = WatermarkStrategy.for_monotonous_timestamps()
        strategy = strategy.with_timestamp_assigner(SecondColumnTimestampAssigner())

        timestamped = data_stream.assign_timestamps_and_watermarks(strategy)
        keyed = timestamped.key_by(lambda record: record[0], key_type=Types.STRING())
        windowed = keyed.window(TumblingEventTimeWindows.of(Time.milliseconds(5)))
        counted = windowed.process(
            CountWindowProcessFunction(),
            Types.TUPLE([Types.STRING(), Types.INT()]))
        counted.add_sink(self.test_sink)

        self.env.execute('test_event_time_tumbling_window')

        self.assert_equals_sorted(
            ['(hi,4)', '(hi,3)', '(hi,1)'],
            self.test_sink.get_results())
예제 #3
0
    # Source: a bounded in-memory collection of ('hi', n) records, declared as
    # TUPLE(STRING, INT).
    data_stream = env.from_collection([('hi', 1), ('hi', 2), ('hi', 3),
                                       ('hi', 4), ('hi', 5), ('hi', 8),
                                       ('hi', 9), ('hi', 15)],
                                      type_info=Types.TUPLE(
                                          [Types.STRING(),
                                           Types.INT()]))

    # Watermark strategy: monotonous timestamps, extracted by MyTimestampAssigner.
    # NOTE(review): MyTimestampAssigner is defined outside this fragment; presumably
    # it uses the second tuple field as the event time -- confirm.
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(MyTimestampAssigner())

    # Pipeline: assign timestamps/watermarks, key by the first field, group into
    # 5 ms tumbling event-time windows and apply CountWindowProcessFunction, whose
    # output is declared as TUPLE(STRING, INT, INT, INT).
    ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()]))

    # Sink: when an output path is given, write through a row-format FileSink with
    # the simple string encoder, part files named with prefix "prefix" and suffix
    # ".ext", and the default rolling policy; otherwise print the results.
    if output_path is not None:
        ds.sink_to(sink=FileSink.for_row_format(
            base_path=output_path, encoder=Encoder.simple_string_encoder()
        ).with_output_file_config(OutputFileConfig.builder().with_part_prefix(
            "prefix").with_part_suffix(".ext").build()).with_rolling_policy(
                RollingPolicy.default_rolling_policy()).build())
    else:
        print(
            "Printing result to stdout. Use --output to specify output path.")
        ds.print()