def word_count(input_path, output_path): env = StreamExecutionEnvironment.get_execution_environment() env.set_runtime_mode(RuntimeExecutionMode.BATCH) # write all the data to one file env.set_parallelism(1) # define the source if input_path is not None: ds = env.from_source( source=FileSource.for_record_stream_format(StreamFormat.text_line_format(), input_path) .process_static_file_set().build(), watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(), source_name="file_source" ) else: print("Executing word_count example with default input data set.") print("Use --input to specify file input.") ds = env.from_collection(word_count_data) def split(line): yield from line.split() # compute word count ds = ds.flat_map(split) \ .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \ .key_by(lambda i: i[0]) \ .reduce(lambda i, j: (i[0], i[1] + j[1])) # define the sink if output_path is not None: ds.sink_to( sink=FileSink.for_row_format( base_path=output_path, encoder=Encoder.simple_string_encoder()) .with_output_file_config( OutputFileConfig.builder() .with_part_prefix("prefix") .with_part_suffix(".ext") .build()) .with_rolling_policy(RollingPolicy.default_rolling_policy()) .build() ) else: print("Printing result to stdout. Use --output to specify output path.") ds.print() # submit for execution env.execute()
help='Output file to write results to.') argv = sys.argv[1:] known_args, _ = parser.parse_known_args(argv) output_path = known_args.output env = StreamExecutionEnvironment.get_execution_environment() # write all the data to one file env.set_parallelism(1) # define the source data_stream = env.from_collection([('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 5), ('hi', 8), ('hi', 9), ('hi', 15)], type_info=Types.TUPLE( [Types.STRING(), Types.INT()])) # define the watermark strategy watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \ .with_timestamp_assigner(MyTimestampAssigner()) ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \ .key_by(lambda x: x[0], key_type=Types.STRING()) \ .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \ .process(CountWindowProcessFunction(), Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()])) # define the sink if output_path is not None: ds.sink_to(sink=FileSink.for_row_format(
dest='output', required=False, help='Output file to write results to.') argv = sys.argv[1:] known_args, _ = parser.parse_known_args(argv) output_path = known_args.output env = StreamExecutionEnvironment.get_execution_environment() # write all the data to one file env.set_parallelism(1) # define the source data_stream = env.from_collection([ (1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'), (5, 'hi'), (6, 'hello'), (6, 'hello')], type_info=Types.TUPLE([Types.INT(), Types.STRING()])) ds = data_stream.key_by(lambda x: x[1], key_type=Types.STRING()) \ .count_window(2) \ .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) # define the sink if output_path is not None: ds.sink_to( sink=FileSink.for_row_format( base_path=output_path, encoder=Encoder.simple_string_encoder()) .with_output_file_config( OutputFileConfig.builder() .with_part_prefix("prefix") .with_part_suffix(".ext")