def test_window_all_reduce_process(self):
    """Reduce a non-keyed session window and post-process the reduced value.

    All records go through ``window_all`` with an event-time session gap of
    2 ms. The reduce step keeps the first record's label and sums the second
    field; the window function then formats the window start together with
    the single reduced element it receives.
    """
    self.env.set_parallelism(1)
    records = [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)]
    source = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    # presumably the assigner uses the second tuple field as the event timestamp
    strategy = WatermarkStrategy.for_monotonous_timestamps().with_timestamp_assigner(
        SecondColumnTimestampAssigner())

    class MyProcessFunction(ProcessAllWindowFunction):
        """Render the window start and the pre-reduced element as one string."""

        def process(self, context: 'ProcessAllWindowFunction.Context',
                    elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
            window_start = context.window().start
            # Only the first element of the iterable is consumed.
            reduced = next(iter(elements))
            yield "current window start at {}, reduce result {}".format(
                window_start, reduced)

    timestamped = source.assign_timestamps_and_watermarks(strategy)
    sessions = timestamped.window_all(
        EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
    formatted = sessions.reduce(
        lambda a, b: (a[0], a[1] + b[1]),
        window_function=MyProcessFunction(),
        output_type=Types.STRING())
    formatted.add_sink(self.test_sink)

    self.env.execute('test_window_all_reduce_process')
    expected = [
        "current window start at 1, reduce result ('a', 6)",
        "current window start at 6, reduce result ('a', 23)",
        "current window start at 15, reduce result ('a', 15)"
    ]
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_window_aggregate_accumulator_type(self):
    """Aggregate keyed session windows with an explicitly declared accumulator type.

    Records are keyed by their first field and grouped into event-time
    session windows with a 2 ms gap. The accumulator is an ``(int, str)``
    pair, declared to the runtime through ``accumulator_type``; the emitted
    result swaps it into ``(str, int)``.
    """
    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    class MyAggregateFunction(AggregateFunction):
        """Sum the second field of each record; accumulator is ``(sum, label)``."""

        def create_accumulator(self) -> Tuple[int, str]:
            return 0, ''

        def add(self, value: Tuple[str, int],
                accumulator: Tuple[int, str]) -> Tuple[int, str]:
            # The label slot is overwritten with the latest record's first field.
            return value[1] + accumulator[0], value[0]

        def get_result(self, accumulator: Tuple[int, str]) -> Tuple[str, int]:
            # Swap (sum, label) into the (label, sum) output layout.
            return accumulator[1], accumulator[0]

        def merge(self, acc_a: Tuple[int, str],
                  acc_b: Tuple[int, str]) -> Tuple[int, str]:
            return acc_a[0] + acc_b[0], acc_a[1]

    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
        .aggregate(MyAggregateFunction(),
                   accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                   output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)

    self.env.execute('test_time_window_aggregate_accumulator_type')
    results = self.test_sink.get_results()
    expected = ['(a,15)', '(a,3)', '(a,6)', '(b,17)', '(b,3)']
    self.assert_equals_sorted(expected, results)
def test_window_aggregate_passthrough(self):
    """Aggregate keyed session windows without declaring an accumulator type.

    The accumulator is a ``(label, {0: even_count, 1: odd_count})`` pair and
    no ``accumulator_type`` is passed to ``aggregate``. Each window emits the
    label together with the even count minus the odd count of the second
    field of its records.
    """
    records = [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)]
    source = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps().with_timestamp_assigner(
        SecondColumnTimestampAssigner())

    class MyAggregateFunction(AggregateFunction):
        """Count even and odd values per window, keyed by ``value % 2``."""

        def create_accumulator(self) -> Tuple[str, Dict[int, int]]:
            return '', {0: 0, 1: 0}

        def add(
            self, value: Tuple[str, int], accumulator: Tuple[str, Dict[int, int]]
        ) -> Tuple[str, Dict[int, int]]:
            # The accumulator's dict is updated in place and handed back.
            counts = accumulator[1]
            parity = value[1] % 2
            counts[parity] = counts[parity] + 1
            return value[0], counts

        def get_result(
                self, accumulator: Tuple[str, Dict[int, int]]) -> Tuple[str, int]:
            counts = accumulator[1]
            difference = counts[0] - counts[1]
            return accumulator[0], difference

        def merge(
            self, acc_a: Tuple[str, Dict[int, int]], acc_b: Tuple[str, Dict[int, int]]
        ) -> Tuple[str, Dict[int, int]]:
            left = acc_a[1]
            right = acc_b[1]
            # Build a fresh dict so neither input accumulator is mutated.
            combined = {parity: left[parity] + right[parity] for parity in (0, 1)}
            return acc_a[0], combined

    timestamped = source.assign_timestamps_and_watermarks(strategy)
    keyed = timestamped.key_by(lambda x: x[0], key_type=Types.STRING())
    sessions = keyed.window(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
    aggregated = sessions.aggregate(
        MyAggregateFunction(),
        output_type=Types.TUPLE([Types.STRING(), Types.INT()]))
    aggregated.add_sink(self.test_sink)

    self.env.execute('test_time_window_aggregate_passthrough')
    expected = ['(a,-1)', '(a,0)', '(a,1)', '(b,-1)', '(b,0)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_window_aggregate_process(self):
    """Aggregate keyed session windows and format the result in a window function.

    Records are keyed by their first field and grouped into event-time
    session windows with a 2 ms gap. ``MyAggregateFunction`` sums the second
    field per window; ``MyProcessWindowFunction`` then receives that single
    aggregated ``(label, sum)`` element and renders it as a string.
    """
    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    class MyAggregateFunction(AggregateFunction):
        """Sum the second field of each record; accumulator is ``(sum, label)``."""

        def create_accumulator(self) -> Tuple[int, str]:
            return 0, ''

        def add(self, value: Tuple[str, int],
                accumulator: Tuple[int, str]) -> Tuple[int, str]:
            return value[1] + accumulator[0], value[0]

        def get_result(self, accumulator: Tuple[int, str]) -> Tuple[str, int]:
            # Swap (sum, label) into the (label, sum) layout the window function reads.
            return accumulator[1], accumulator[0]

        def merge(self, acc_a: Tuple[int, str],
                  acc_b: Tuple[int, str]) -> Tuple[int, str]:
            return acc_a[0] + acc_b[0], acc_a[1]

    class MyProcessWindowFunction(ProcessWindowFunction):
        """Format the aggregated element of a window as a readable string."""

        def process(self, key: str, context: ProcessWindowFunction.Context,
                    elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
            # Only the first element of the iterable is consumed.
            agg_result = next(iter(elements))
            yield "key {} timestamp sum {}".format(agg_result[0], agg_result[1])

        def clear(self, context: ProcessWindowFunction.Context) -> None:
            pass

    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
        .aggregate(MyAggregateFunction(),
                   window_function=MyProcessWindowFunction(),
                   accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                   output_type=Types.STRING()) \
        .add_sink(self.test_sink)

    # Label the job after this test rather than after the accumulator-type test.
    self.env.execute('test_window_aggregate_process')
    results = self.test_sink.get_results()
    expected = [
        'key a timestamp sum 15',
        'key a timestamp sum 3',
        'key a timestamp sum 6',
        'key b timestamp sum 17',
        'key b timestamp sum 3'
    ]
    self.assert_equals_sorted(expected, results)
def test_session_window_late_merge(self):
    """Check that an out-of-order record bridges two session windows.

    With a 5 ms gap the records at timestamps 0 and 8 start separate
    sessions; the record at 4 arrives last and is expected to merge
    everything into one window of three elements.
    """
    records = [('hi', 0), ('hi', 8), ('hi', 4)]
    source = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    # presumably the assigner uses the second tuple field as the event timestamp
    strategy = WatermarkStrategy.for_monotonous_timestamps().with_timestamp_assigner(
        SecondColumnTimestampAssigner())

    timestamped = source.assign_timestamps_and_watermarks(strategy)
    keyed = timestamped.key_by(lambda x: x[0], key_type=Types.STRING())
    sessions = keyed.window(EventTimeSessionWindows.with_gap(Time.milliseconds(5)))
    counted = sessions.process(
        CountWindowProcessFunction(),
        Types.TUPLE([Types.STRING(), Types.INT()]))
    counted.add_sink(self.test_sink)

    self.env.execute('test_session_window_late_merge')
    expected = ['(hi,3)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_window_reduce_passthrough(self):
    """Reduce keyed session windows without an extra window function.

    Records are keyed by their first field and grouped into event-time
    session windows with a 2 ms gap. The reducer keeps the label of the
    right-hand record and sums the second fields.
    """
    records = [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)]
    source = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps().with_timestamp_assigner(
        SecondColumnTimestampAssigner())

    timestamped = source.assign_timestamps_and_watermarks(strategy)
    keyed = timestamped.key_by(lambda x: x[0], key_type=Types.STRING())
    sessions = keyed.window(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
    summed = sessions.reduce(
        lambda a, b: (b[0], a[1] + b[1]),
        output_type=Types.TUPLE([Types.STRING(), Types.INT()]))
    summed.add_sink(self.test_sink)

    self.env.execute('test_time_window_reduce_passthrough')
    expected = ['(a,3)', '(a,6)', '(a,15)', '(b,3)', '(b,17)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_event_time_dynamic_gap_session_window(self):
    """Count elements per session when the gap is computed per element.

    The session gap is supplied by ``MySessionWindowTimeGapExtractor``
    (defined elsewhere in this file) instead of a fixed duration. The test
    expects two sessions, holding four and three elements respectively.
    """
    self.env.set_parallelism(1)
    records = [
        ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 9), ('hi', 9), ('hi', 15)]
    source = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps().with_timestamp_assigner(
        SecondColumnTimestampAssigner())

    timestamped = source.assign_timestamps_and_watermarks(strategy)
    keyed = timestamped.key_by(lambda x: x[0], key_type=Types.STRING())
    sessions = keyed.window(
        EventTimeSessionWindows.with_dynamic_gap(MySessionWindowTimeGapExtractor()))
    counted = sessions.process(
        CountWindowProcessFunction(),
        Types.TUPLE([Types.STRING(), Types.INT()]))
    counted.add_sink(self.test_sink)

    self.env.execute('test_event_time_dynamic_gap_session_window')
    expected = ['(hi,3)', '(hi,4)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_event_time_session_window_with_purging_trigger(self):
    """Count elements per session window fired through a purging event-time trigger.

    Records are keyed by their first field and grouped into event-time
    session windows with a 3 ms gap; the default trigger is replaced with
    ``PurgingTrigger.of(EventTimeTrigger.create())``.

    ``CountWindowProcessFunction`` is defined elsewhere in this file. The
    other session-window tests consume its output as ``(key, count)`` pairs
    typed ``TUPLE([STRING, INT])``, so this test declares the same two-field
    output type and asserts the per-session counts. The timestamps 1-4, 8-9
    and 15 form three sessions of 4, 2 and 1 elements.
    """
    data_stream = self.env.from_collection(
        [('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9), ('hi', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(3))) \
        .trigger(PurgingTrigger.of(EventTimeTrigger.create())) \
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)

    self.env.execute('test_event_time_session_window_with_purging_trigger')
    results = self.test_sink.get_results()
    expected = ['(hi,1)', '(hi,2)', '(hi,4)']
    self.assert_equals_sorted(expected, results)
# define the source data_stream = env.from_collection([('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9), ('hi', 15)], type_info=Types.TUPLE( [Types.STRING(), Types.INT()])) # define the watermark strategy watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \ .with_timestamp_assigner(MyTimestampAssigner()) ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \ .key_by(lambda x: x[0], key_type=Types.STRING()) \ .window(EventTimeSessionWindows.with_gap(Time.milliseconds(5))) \ .process(CountWindowProcessFunction(), Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()])) # define the sink if output_path is not None: ds.sink_to(sink=FileSink.for_row_format( base_path=output_path, encoder=Encoder.simple_string_encoder() ).with_output_file_config(OutputFileConfig.builder().with_part_prefix( "prefix").with_part_suffix(".ext").build()).with_rolling_policy( RollingPolicy.default_rolling_policy()).build()) else: print( "Printing result to stdout. Use --output to specify output path.") ds.print()
env = StreamExecutionEnvironment.get_execution_environment() # write all the data to one file env.set_parallelism(1) # define the source data_stream = env.from_collection([ ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9), ('hi', 15)], type_info=Types.TUPLE([Types.STRING(), Types.INT()])) # define the watermark strategy watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \ .with_timestamp_assigner(MyTimestampAssigner()) ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \ .key_by(lambda x: x[0], key_type=Types.STRING()) \ .window(EventTimeSessionWindows.with_dynamic_gap(MySessionWindowTimeGapExtractor())) \ .process(CountWindowProcessFunction(), Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()])) # define the sink if output_path is not None: ds.sink_to( sink=FileSink.for_row_format( base_path=output_path, encoder=Encoder.simple_string_encoder()) .with_output_file_config( OutputFileConfig.builder() .with_part_prefix("prefix") .with_part_suffix(".ext") .build()) .with_rolling_policy(RollingPolicy.default_rolling_policy())