def data_stream_word_count_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    input_path = '/opt/examples/datastream/input/word_count_input'
    output_path = '/opt/examples/datastream/output/data_stream_word_count'

    file_source = FileSource \
        .for_record_stream_format(StreamFormat.text_line_format(), input_path) \
        .process_static_file_set() \
        .build()

    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(
            OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=file_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='file_source',
        type_info=Types.STRING())

    ds.map(lambda a: Row(a, 1), output_type=Types.ROW([Types.STRING(), Types.INT()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
        .sink_to(file_sink)

    env.execute('9-data_stream_word_count')
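# The demo above assumes the usual PyFlink imports; a minimal sketch of what they
# might look like is below. Module paths follow the Flink 1.16+ layout, where the
# file connector classes live under pyflink.datastream.connectors.file_system
# (older releases export them from pyflink.datastream.connectors directly).
from pyflink.common import Encoder, Row, Types, WatermarkStrategy
from pyflink.datastream import RuntimeExecutionMode, StreamExecutionEnvironment
from pyflink.datastream.connectors.file_system import (FileSink, FileSource,
                                                       OutputFileConfig, StreamFormat)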
def state_access_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    seq_num_source = NumberSequenceSource(1, 10)
    output_path = '/opt/examples/datastream/output/state_access'

    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(
            OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='seq_num_source',
        type_info=Types.LONG())

    ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .map(MyMapFunction(), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .process(MyKeyedProcessFunction(), Types.LONG()) \
        .sink_to(file_sink)

    env.execute('11-data_stream_state_access')
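# MyMapFunction and MyKeyedProcessFunction are not shown above. The sketch below is
# one hypothetical shape for them: the class bodies are assumptions, but the keyed
# state primitives (ValueStateDescriptor, RuntimeContext.get_state) are the standard
# PyFlink state API used on a keyed stream.
from pyflink.common import Row, Types
from pyflink.datastream.functions import KeyedProcessFunction, MapFunction, RuntimeContext
from pyflink.datastream.state import ValueStateDescriptor


class MyMapFunction(MapFunction):
    """Keeps a running count per key in keyed ValueState (hypothetical example)."""

    def open(self, runtime_context: RuntimeContext):
        self.cnt_state = runtime_context.get_state(
            ValueStateDescriptor('cnt', Types.LONG()))

    def map(self, value):
        # value is Row(key, 1); add it to the count stored for the current key
        cnt = (self.cnt_state.value() or 0) + value[1]
        self.cnt_state.update(cnt)
        return Row(value[0], cnt)


class MyKeyedProcessFunction(KeyedProcessFunction):
    """Accumulates a per-key sum in ValueState and emits it (hypothetical example)."""

    def open(self, runtime_context: RuntimeContext):
        self.sum_state = runtime_context.get_state(
            ValueStateDescriptor('sum', Types.LONG()))

    def process_element(self, value, ctx):
        total = (self.sum_state.value() or 0) + value[1]
        self.sum_state.update(total)
        yield total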
def batch_seq_num_test():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(2)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    seq_num_source = NumberSequenceSource(1, 1000)
    output_path = '/opt/examples/output/batch_seq_num'

    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(
            OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='file_source',
        type_info=Types.LONG())

    ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
        .sink_to(file_sink)

    env.execute('9-data_stream_batch_seq_num')
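# Note: because the job runs in RuntimeExecutionMode.BATCH, the keyed reduce emits
# only the final aggregate per key rather than incremental updates. For the 1..1000
# sequence each residue of `a % 4` (0 through 3) occurs 250 times, so the sink should
# receive one Row per key with a count of 250.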
def test_stream_file_sink(self):
    self.env.set_parallelism(2)
    ds = self.env.from_collection(
        [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))
    ds.map(lambda a: a[0], Types.STRING()).add_sink(
        StreamingFileSink.for_row_format(self.tempdir, Encoder.simple_string_encoder())
        .with_rolling_policy(
            RollingPolicy.default_rolling_policy(
                part_size=1024 * 1024 * 1024,
                rollover_interval=15 * 60 * 1000,
                inactivity_interval=5 * 60 * 1000))
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix").build())
        .build())

    self.env.execute("test_streaming_file_sink")

    results = []
    import os
    for root, dirs, files in os.walk(self.tempdir, topdown=True):
        for file in files:
            self.assertTrue(file.startswith('.prefix'))
            self.assertTrue('suffix' in file)
            path = root + "/" + file
            with open(path) as infile:
                for line in infile:
                    results.append(line)

    expected = ['deeefg\n', 'bdc\n', 'ab\n', 'cfgs\n']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
def test_file_sink(self):
    base_path = "/tmp/1.txt"
    encoder = Encoder.simple_string_encoder()
    file_sink_builder = FileSink.for_row_format(base_path, encoder)
    file_sink = file_sink_builder \
        .with_bucket_check_interval(1000) \
        .with_bucket_assigner(BucketAssigner.base_path_bucket_assigner()) \
        .with_rolling_policy(RollingPolicy.on_checkpoint_rolling_policy()) \
        .with_output_file_config(
            OutputFileConfig.builder().with_part_prefix("pre").with_part_suffix("suf").build()) \
        .build()

    buckets_builder_field = \
        load_java_class("org.apache.flink.connector.file.sink.FileSink"). \
        getDeclaredField("bucketsBuilder")
    buckets_builder_field.setAccessible(True)
    buckets_builder = buckets_builder_field.get(file_sink.get_java_function())

    self.assertEqual("DefaultRowFormatBuilder", buckets_builder.getClass().getSimpleName())

    row_format_builder_clz = load_java_class(
        "org.apache.flink.connector.file.sink.FileSink$RowFormatBuilder")

    encoder_field = row_format_builder_clz.getDeclaredField("encoder")
    encoder_field.setAccessible(True)
    self.assertEqual(
        "SimpleStringEncoder",
        encoder_field.get(buckets_builder).getClass().getSimpleName())

    interval_field = row_format_builder_clz.getDeclaredField("bucketCheckInterval")
    interval_field.setAccessible(True)
    self.assertEqual(1000, interval_field.get(buckets_builder))

    bucket_assigner_field = row_format_builder_clz.getDeclaredField("bucketAssigner")
    bucket_assigner_field.setAccessible(True)
    self.assertEqual(
        "BasePathBucketAssigner",
        bucket_assigner_field.get(buckets_builder).getClass().getSimpleName())

    rolling_policy_field = row_format_builder_clz.getDeclaredField("rollingPolicy")
    rolling_policy_field.setAccessible(True)
    self.assertEqual(
        "OnCheckpointRollingPolicy",
        rolling_policy_field.get(buckets_builder).getClass().getSimpleName())

    output_file_config_field = row_format_builder_clz.getDeclaredField("outputFileConfig")
    output_file_config_field.setAccessible(True)
    output_file_config = output_file_config_field.get(buckets_builder)
    self.assertEqual("pre", output_file_config.getPartPrefix())
    self.assertEqual("suf", output_file_config.getPartSuffix())
def run_consumer(output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    # write all the data to one file
    env.set_parallelism(1)

    # get the credit card data
    dataset = datasets.CreditCard()

    # create a small collection of items
    i = 0
    num_of_items = 2000
    items = []
    for x, y in dataset:
        if i == num_of_items:
            break
        i += 1
        items.append((json.dumps(x), y))

    credit_stream = env.from_collection(
        collection=items,
        type_info=Types.ROW([Types.STRING(), Types.STRING()]))

    # detect fraud in transactions
    fraud_data = credit_stream.map(
        lambda data: json.dumps(
            requests.post('http://localhost:9000/predict',
                          json={'x': data[0], 'y': data[1]}).json()),
        output_type=Types.STRING())

    # save the results to a file
    fraud_data.sink_to(
        sink=FileSink.for_row_format(
            base_path=output_path,
            encoder=Encoder.simple_string_encoder()).build())

    # submit for execution
    env.execute()
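# The consumer above POSTs each transaction to http://localhost:9000/predict and
# stores whatever JSON the service returns. The original service is not shown; the
# sketch below is one hypothetical shape for it, using Flask and a river online
# model (the endpoint name and port come from the consumer, while the model choice
# and response fields are assumptions).
import json

from flask import Flask, jsonify, request
from river import compose, linear_model, preprocessing

app = Flask(__name__)

# simple online classifier, updated with every labelled transaction it receives
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression())


@app.route('/predict', methods=['POST'])
def predict():
    payload = request.get_json()
    x = json.loads(payload['x'])                # feature dict serialized by the consumer
    y = payload['y'] in (1, '1', True, 'True')  # tolerate int/str/bool label encodings
    score = model.predict_proba_one(x).get(True, 0.0)
    model.learn_one(x, y)                       # learn after scoring (prequential evaluation)
    return jsonify({'fraud_probability': score, 'label': payload['y']})


if __name__ == '__main__':
    app.run(port=9000)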
def test_file_sink(self):
    base_path = "/tmp/1.txt"
    encoder = Encoder.simple_string_encoder()
    file_sink_builder = FileSink.for_row_format(base_path, encoder)
    file_sink = file_sink_builder \
        .with_bucket_check_interval(1000) \
        .with_bucket_assigner(BucketAssigner.base_path_bucket_assigner()) \
        .with_rolling_policy(RollingPolicy.on_checkpoint_rolling_policy()) \
        .with_output_file_config(
            OutputFileConfig.builder().with_part_prefix("pre").with_part_suffix("suf").build()) \
        .enable_compact(
            FileCompactStrategy.builder()
            .enable_compaction_on_checkpoint(3)
            .set_size_threshold(1024)
            .set_num_compact_threads(2)
            .build(),
            FileCompactor.concat_file_compactor(b'\n')) \
        .build()

    buckets_builder_field = \
        load_java_class("org.apache.flink.connector.file.sink.FileSink"). \
        getDeclaredField("bucketsBuilder")
    buckets_builder_field.setAccessible(True)
    buckets_builder = buckets_builder_field.get(file_sink.get_java_function())

    self.assertEqual("DefaultRowFormatBuilder", buckets_builder.getClass().getSimpleName())

    row_format_builder_clz = load_java_class(
        "org.apache.flink.connector.file.sink.FileSink$RowFormatBuilder")

    encoder_field = row_format_builder_clz.getDeclaredField("encoder")
    encoder_field.setAccessible(True)
    self.assertEqual(
        "SimpleStringEncoder",
        encoder_field.get(buckets_builder).getClass().getSimpleName())

    interval_field = row_format_builder_clz.getDeclaredField("bucketCheckInterval")
    interval_field.setAccessible(True)
    self.assertEqual(1000, interval_field.get(buckets_builder))

    bucket_assigner_field = row_format_builder_clz.getDeclaredField("bucketAssigner")
    bucket_assigner_field.setAccessible(True)
    self.assertEqual(
        "BasePathBucketAssigner",
        bucket_assigner_field.get(buckets_builder).getClass().getSimpleName())

    rolling_policy_field = row_format_builder_clz.getDeclaredField("rollingPolicy")
    rolling_policy_field.setAccessible(True)
    self.assertEqual(
        "OnCheckpointRollingPolicy",
        rolling_policy_field.get(buckets_builder).getClass().getSimpleName())

    output_file_config_field = row_format_builder_clz.getDeclaredField("outputFileConfig")
    output_file_config_field.setAccessible(True)
    output_file_config = output_file_config_field.get(buckets_builder)
    self.assertEqual("pre", output_file_config.getPartPrefix())
    self.assertEqual("suf", output_file_config.getPartSuffix())

    compact_strategy_field = row_format_builder_clz.getDeclaredField("compactStrategy")
    compact_strategy_field.setAccessible(True)
    compact_strategy = compact_strategy_field.get(buckets_builder)
    self.assertEqual(3, compact_strategy.getNumCheckpointsBeforeCompaction())
    self.assertEqual(1024, compact_strategy.getSizeThreshold())
    self.assertEqual(2, compact_strategy.getNumCompactThreads())

    file_compactor_field = row_format_builder_clz.getDeclaredField("fileCompactor")
    file_compactor_field.setAccessible(True)
    file_compactor = file_compactor_field.get(buckets_builder)
    self.assertEqual("ConcatFileCompactor", file_compactor.getClass().getSimpleName())

    concat_file_compactor_clz = load_java_class(
        "org.apache.flink.connector.file.sink.compactor.ConcatFileCompactor")
    file_delimiter_field = concat_file_compactor_clz.getDeclaredField("fileDelimiter")
    file_delimiter_field.setAccessible(True)
    file_delimiter = file_delimiter_field.get(file_compactor)
    self.assertEqual(b'\n', file_delimiter)