Example #1
 # Builds a job that reads Avro GenericRecords from a Parquet file and
 # forwards them unchanged to a test sink.
 def _build_parquet_avro_job(self, record_schema, parquet_file_name):
     ds = self.env.from_source(
         FileSource.for_record_stream_format(
             AvroParquetReaders.for_generic_record(record_schema),
             parquet_file_name).build(),
         WatermarkStrategy.for_monotonous_timestamps(), "parquet-source")
     ds.map(PassThroughMapFunction()).add_sink(self.test_sink)
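For context, here is a minimal sketch of the imports and Avro schema this snippet assumes. The module paths follow recent PyFlink releases (they have moved between versions), and the schema itself is a made-up placeholder.

from pyflink.common import WatermarkStrategy
from pyflink.datastream.connectors.file_system import FileSource
from pyflink.datastream.formats.avro import AvroSchema
from pyflink.datastream.formats.parquet import AvroParquetReaders

# Hypothetical Avro schema describing the records stored in the Parquet file.
record_schema = AvroSchema.parse_string("""
{
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "age", "type": "int"}
    ]
}
""")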
Example #2
 # Builds a job that reads Parquet data with the vectorized columnar
 # reader (a bulk format) and forwards each row to a test sink.
 def _build_parquet_columnar_job(self, row_type: RowType):
     source = FileSource.for_bulk_file_format(
         ParquetColumnarRowInputFormat(row_type, Configuration(), 10, True, False),
         self.parquet_file_name
     ).build()
     ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'parquet-source')
     ds.map(lambda e: e).add_sink(self.test_sink)
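The row_type argument is a pyflink.table.types.RowType; below is a hedged sketch of how it might be built (the column names are invented, and the DataTypes-based construction is an assumption about the caller, not part of the original snippet).

from pyflink.common import Configuration
from pyflink.table import DataTypes

# Hypothetical row type listing the Parquet columns to read.
row_type = DataTypes.ROW([
    DataTypes.FIELD('id', DataTypes.BIGINT()),
    DataTypes.FIELD('name', DataTypes.STRING()),
])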
Example #3
 # Builds a job that reads CSV records with CsvReaderFormat and forwards
 # them to a test sink as pickled rows.
 def _build_csv_job(self, schema):
     source = FileSource.for_record_stream_format(
         CsvReaderFormat.for_schema(schema), self.csv_file_name).build()
     ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(),
                               'csv-source')
     ds.map(PassThroughMapFunction(), output_type=Types.PICKLED_BYTE_ARRAY()) \
         .add_sink(self.test_sink)
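A sketch of how the schema argument could be constructed with PyFlink's CsvSchema builder; the column names and separator here are invented for illustration.

from pyflink.datastream.formats.csv import CsvSchema

# Hypothetical schema for a two-column CSV file separated by '|'.
schema = CsvSchema.builder() \
    .add_number_column('id') \
    .add_string_column('name') \
    .set_column_separator('|') \
    .build()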
Example #4
    # Verifies the FileSource builder: the continuous monitoring interval and
    # the configured input paths are checked through the underlying Java object.
    def test_file_source(self):
        stream_format = StreamFormat.text_line_format()
        paths = ["/tmp/1.txt", "/tmp/2.txt"]
        file_source_builder = FileSource.for_record_stream_format(
            stream_format, *paths)
        file_source = file_source_builder\
            .monitor_continuously(Duration.of_days(1)) \
            .set_file_enumerator(FileEnumeratorProvider.default_splittable_file_enumerator()) \
            .set_split_assigner(FileSplitAssignerProvider.locality_aware_split_assigner()) \
            .build()

        continuous_setting = file_source.get_java_function(
        ).getContinuousEnumerationSettings()
        self.assertIsNotNone(continuous_setting)
        self.assertEqual(Duration.of_days(1),
                         Duration(continuous_setting.getDiscoveryInterval()))

        input_paths_field = \
            load_java_class("org.apache.flink.connector.file.src.AbstractFileSource"). \
            getDeclaredField("inputPaths")
        input_paths_field.setAccessible(True)
        input_paths = input_paths_field.get(file_source.get_java_function())
        self.assertEqual(len(input_paths), len(paths))
        self.assertEqual(str(input_paths[0]), paths[0])
        self.assertEqual(str(input_paths[1]), paths[1])
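For comparison, a sketch of the bounded counterpart, continuing the same test method: calling process_static_file_set() instead of monitor_continuously() should leave the continuous-enumeration settings unset. The assertion reflects expected behavior rather than a verified run.

        # Static (bounded) variant of the same builder: the file set is read
        # once, so no continuous enumeration settings are configured.
        static_file_source = FileSource.for_record_stream_format(
            stream_format, *paths).process_static_file_set().build()
        self.assertIsNone(
            static_file_source.get_java_function().getContinuousEnumerationSettings())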
Example #5
 # Builds a columnar Parquet job like Example #2, but with the Hadoop
 # Configuration passed as the first constructor argument.
 def _build_parquet_columnar_job(self, row_type: RowType,
                                 parquet_file_name: str):
     source = FileSource.for_bulk_file_format(
         ParquetColumnarRowInputFormat(Configuration(), row_type, 10, True,
                                       True), parquet_file_name).build()
     ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(),
                               'parquet-source')
     ds.map(PassThroughMapFunction()).add_sink(self.test_sink)
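Note that this variant passes the Hadoop Configuration before the RowType, whereas Example #2 passes the RowType first; the two snippets appear to come from different PyFlink releases whose ParquetColumnarRowInputFormat constructors order these arguments differently, so check the signature of the version you are running.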
Example #6
 # Writes the given lines to a CSV file, reads them back through a
 # FileSource, and writes them out again with a bulk-format CSV FileSink.
 def _build_csv_job(self, schema: CsvSchema, lines):
     with open(self.csv_file_name, 'w') as f:
         for line in lines:
             f.write(line)
     source = FileSource.for_record_stream_format(
         CsvReaderFormat.for_schema(schema), self.csv_file_name).build()
     ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(),
                               'csv-source')
     sink = FileSink.for_bulk_format(
         self.csv_dir_name, CsvBulkWriters.for_schema(schema)).build()
     ds.sink_to(sink)
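One behavioral note: CsvBulkWriters produces a bulk writer factory, and a bulk-format FileSink rolls part files only on checkpoint. In streaming execution the job therefore needs checkpointing enabled for the CSV output to be committed; with a bounded input like this one, the files are finalized when the job finishes.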
Example #7
# Classic word count: reads text lines from a file (or a built-in sample
# collection), counts words, and writes the result to a FileSink or stdout.
def word_count(input_path, output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    if input_path is not None:
        ds = env.from_source(
            source=FileSource.for_record_stream_format(
                StreamFormat.text_line_format(),
                input_path).process_static_file_set().build(),
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="file_source")
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        ds = env.from_collection(word_count_data)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
           .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
           .key_by(lambda i: i[0]) \
           .reduce(lambda i, j: (i[0], i[1] + j[1]))

    # define the sink
    if output_path is not None:
        ds.sink_to(sink=FileSink.for_row_format(
            base_path=output_path, encoder=Encoder.simple_string_encoder()
        ).with_output_file_config(OutputFileConfig.builder().with_part_prefix(
            "prefix").with_part_suffix(".ext").build()).with_rolling_policy(
                RollingPolicy.default_rolling_policy()).build())
    else:
        print(
            "Printing result to stdout. Use --output to specify output path.")
        ds.print()

    # submit for execution
    env.execute()
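A minimal command-line driver sketch for this example, assuming the --input/--output flags hinted at by the messages above and a module-level word_count_data list of sample sentences (not shown here).

# These imports normally sit at the top of the script.
import argparse
import logging
import sys

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=False,
                        help='Output file to write results to.')
    known_args, _ = parser.parse_known_args(sys.argv[1:])

    word_count(known_args.input, known_args.output)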