def _build_parquet_avro_job(self, record_schema, parquet_file_name):
    # Read Avro GenericRecords from the Parquet file and forward them to the test sink.
    ds = self.env.from_source(
        FileSource.for_record_stream_format(
            AvroParquetReaders.for_generic_record(record_schema),
            parquet_file_name).build(),
        WatermarkStrategy.for_monotonous_timestamps(),
        "parquet-source")
    ds.map(PassThroughMapFunction()).add_sink(self.test_sink)

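# Illustrative sketch only: one way the record_schema argument above could be built.
# AvroSchema.parse_string() comes from pyflink.datastream.formats.avro; the field
# names, the file name and the commented-out call are made-up examples rather than
# part of the original tests.
EXAMPLE_AVRO_SCHEMA_JSON = """
{
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "age", "type": "int"}
    ]
}
"""
# record_schema = AvroSchema.parse_string(EXAMPLE_AVRO_SCHEMA_JSON)
# self._build_parquet_avro_job(record_schema, '/tmp/users.parquet')
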
def _build_parquet_columnar_job(self, row_type: RowType):
    # Read the Parquet file with the vectorized columnar reader (batch size 10) and
    # forward the rows unchanged to the test sink.
    source = FileSource.for_bulk_file_format(
        ParquetColumnarRowInputFormat(row_type, Configuration(), 10, True, False),
        self.parquet_file_name
    ).build()
    ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'parquet-source')
    ds.map(lambda e: e).add_sink(self.test_sink)

def _build_csv_job(self, schema):
    # Read CSV records with the given schema and forward them to the test sink.
    source = FileSource.for_record_stream_format(
        CsvReaderFormat.for_schema(schema), self.csv_file_name).build()
    ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'csv-source')
    ds.map(PassThroughMapFunction(), output_type=Types.PICKLED_BYTE_ARRAY()) \
        .add_sink(self.test_sink)

def test_file_source(self):
    stream_format = StreamFormat.text_line_format()
    paths = ["/tmp/1.txt", "/tmp/2.txt"]
    file_source_builder = FileSource.for_record_stream_format(stream_format, *paths)
    file_source = file_source_builder \
        .monitor_continuously(Duration.of_days(1)) \
        .set_file_enumerator(FileEnumeratorProvider.default_splittable_file_enumerator()) \
        .set_split_assigner(FileSplitAssignerProvider.locality_aware_split_assigner()) \
        .build()

    # The builder settings should be reflected in the underlying Java FileSource.
    continuous_setting = file_source.get_java_function().getContinuousEnumerationSettings()
    self.assertIsNotNone(continuous_setting)
    self.assertEqual(Duration.of_days(1), Duration(continuous_setting.getDiscoveryInterval()))

    # The input paths are private in AbstractFileSource, so read them via reflection.
    input_paths_field = \
        load_java_class("org.apache.flink.connector.file.src.AbstractFileSource") \
        .getDeclaredField("inputPaths")
    input_paths_field.setAccessible(True)
    input_paths = input_paths_field.get(file_source.get_java_function())
    self.assertEqual(len(input_paths), len(paths))
    self.assertEqual(str(input_paths[0]), paths[0])
    self.assertEqual(str(input_paths[1]), paths[1])

def _build_parquet_columnar_job(self, row_type: RowType, parquet_file_name: str):
    # Columnar Parquet read for an explicitly supplied file path.
    source = FileSource.for_bulk_file_format(
        ParquetColumnarRowInputFormat(Configuration(), row_type, 10, True, True),
        parquet_file_name
    ).build()
    ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'parquet-source')
    ds.map(PassThroughMapFunction()).add_sink(self.test_sink)

def _build_csv_job(self, schema: CsvSchema, lines):
    # Write the test lines to disk, then read them back as CSV and write them out
    # again through a CSV bulk-format file sink.
    with open(self.csv_file_name, 'w') as f:
        for line in lines:
            f.write(line)
    source = FileSource.for_record_stream_format(
        CsvReaderFormat.for_schema(schema), self.csv_file_name).build()
    ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'csv-source')
    sink = FileSink.for_bulk_format(
        self.csv_dir_name, CsvBulkWriters.for_schema(schema)).build()
    ds.sink_to(sink)

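# Illustrative sketch only: a hypothetical test that drives the helper above. It
# assumes the CsvSchema.builder() API from pyflink.datastream.formats.csv and that
# self.env, self.csv_file_name and self.csv_dir_name are prepared in setUp(); the
# column names and sample lines are made up for the example.
def test_csv_roundtrip_example(self):
    schema = CsvSchema.builder() \
        .add_number_column('id') \
        .add_string_column('name') \
        .build()
    lines = ['1,flink\n', '2,pyflink\n']
    self._build_csv_job(schema, lines)
    self.env.execute('test_csv_roundtrip_example')
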
def word_count(input_path, output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    if input_path is not None:
        ds = env.from_source(
            source=FileSource.for_record_stream_format(
                StreamFormat.text_line_format(), input_path)
            .process_static_file_set()
            .build(),
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="file_source"
        )
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        ds = env.from_collection(word_count_data)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
        .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
        .key_by(lambda i: i[0]) \
        .reduce(lambda i, j: (i[0], i[1] + j[1]))

    # define the sink
    if output_path is not None:
        ds.sink_to(
            sink=FileSink.for_row_format(
                base_path=output_path,
                encoder=Encoder.simple_string_encoder())
            .with_output_file_config(
                OutputFileConfig.builder()
                .with_part_prefix("prefix")
                .with_part_suffix(".ext")
                .build())
            .with_rolling_policy(RollingPolicy.default_rolling_policy())
            .build())
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        ds.print()

    # submit for execution
    env.execute()
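
# Illustrative sketch only: a minimal command-line entry point for word_count().
# The --input/--output flag names match the usage hints printed above; the argparse
# and logging wiring here is an assumption, not part of the original snippet.
if __name__ == '__main__':
    import argparse
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=False,
                        help='Output file to write results to.')
    known_args, _ = parser.parse_known_args(sys.argv[1:])

    word_count(known_args.input, known_args.output)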