Example #1
    def test_stream_file_sink(self):
        self.env.set_parallelism(2)
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                      type_info=Types.ROW([Types.STRING(), Types.INT()]))
        ds.map(lambda a: a[0], Types.STRING()).add_sink(
            StreamingFileSink.for_row_format(self.tempdir, SimpleStringEncoder())
            .with_rolling_policy(
                DefaultRollingPolicy.builder()
                .with_rollover_interval(15 * 60 * 1000)
                .with_inactivity_interval(5 * 60 * 1000)
                .with_max_part_size(1024 * 1024 * 1024)
                .build())
            .with_output_file_config(
                OutputFileConfig.OutputFileConfigBuilder()
                .with_part_prefix("prefix")
                .with_part_suffix("suffix")
                .build())
            .build())

        self.env.execute("test_streaming_file_sink")

        results = []
        import os
        for root, dirs, files in os.walk(self.tempdir, topdown=True):
            for file in files:
                self.assertTrue(file.startswith('.prefix'))
                self.assertTrue('suffix' in file)
                path = root + "/" + file
                with open(path) as infile:
                    for line in infile:
                        results.append(line)

        expected = ['deeefg\n', 'bdc\n', 'ab\n', 'cfgs\n']
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)
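The test above relies on imports defined at module level in the original test file; a plausible set, assuming a PyFlink 1.12-era module layout (paths may differ in other versions), is:

# Assumed imports for Example #1 (module paths may vary between PyFlink versions).
from pyflink.common.serialization import SimpleStringEncoder
from pyflink.common.typeinfo import Types
from pyflink.datastream.connectors import (DefaultRollingPolicy, OutputFileConfig,
                                           StreamingFileSink)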
Example #2
def tutorial():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    ds = env.from_collection(collection=[(1, 'aaa'), (2, 'bbb')],
                             type_info=Types.ROW([Types.INT(),
                                                  Types.STRING()]))
    ds.add_sink(
        StreamingFileSink.for_row_format('output',
                                         SimpleStringEncoder()).build())
    env.execute("tutorial_job")
Example #3
def tutorial():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    # env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    ds = env.from_collection(collection=[(1, 'aaa'), (2, 'bbb')],
                             type_info=Types.ROW([Types.INT(),
                                                  Types.STRING()]))
    ds.add_sink(
        StreamingFileSink.for_row_format(
            'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output',
            SimpleStringEncoder()).build())
    env.execute("tutorial_job")
Example #4
def connect_operators():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_python_executable(
        r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds1 = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(),
                             Types.STRING(),
                             Types.STRING()]))
    ds2 = s_env.from_collection(
        [(3, 'Hi2', 'Hello2'), (4, 'Hello2', 'Hi2')],
        type_info=Types.ROW([Types.INT(),
                             Types.STRING(),
                             Types.STRING()]))

    # connect: DataStream, DataStream → ConnectedStreams
    # cs = ds1.connect(ds2).map(MyCoMapFunction())  # , output_type=Types.INT()
    cs = ds1.connect(ds2).flat_map(
        MyCoFlatMapFunction())  # , output_type=Types.INT()
    cs.add_sink(
        StreamingFileSink.for_row_format('/tmp/output',
                                         SimpleStringEncoder()).build())
    print(s_env.get_execution_plan())
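Example #4 calls MyCoFlatMapFunction (and, in the commented-out variant, MyCoMapFunction) without showing their definitions; a minimal sketch of such user-defined functions, assuming the PyFlink CoFlatMapFunction/CoMapFunction interfaces, could look like this:

# Hypothetical definitions assumed by Example #4; the bodies are illustrative
# only and simply forward the second field of each input row.
from pyflink.datastream.functions import CoFlatMapFunction, CoMapFunction


class MyCoFlatMapFunction(CoFlatMapFunction):
    def flat_map1(self, value):
        # called for every element of the first stream (ds1)
        yield value[1]

    def flat_map2(self, value):
        # called for every element of the second stream (ds2)
        yield value[1]


class MyCoMapFunction(CoMapFunction):
    def map1(self, value):
        return value[1]

    def map2(self, value):
        return value[1]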
Example #5
from pyflink.common.serialization import (JsonRowDeserializationSchema,
                                          SimpleStringEncoder)
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, StreamingFileSink
from pyflink.datastream.functions import MapFunction

s_env = StreamExecutionEnvironment.get_execution_environment()
s_env.set_parallelism(1)
ti = Types.ROW_NAMED(
    ["app", 'busi', 'date', 'ip'],
    [Types.STRING(),
     Types.STRING(),
     Types.BIG_INT(),
     Types.STRING()])
builder = JsonRowDeserializationSchema.builder()
builder.type_info(ti)
jrds = builder.ignore_parse_errors().build()
fkc = FlinkKafkaConsumer(topics="ULS-BUSI-LOG-dev",
                         deserialization_schema=jrds,
                         properties={
                             "bootstrap.servers": "10.100.1.16:9192",
                             "group.id": "123",
                             "auto.offset.reset": "earliest"
                         })
fkc.set_start_from_earliest()
src = s_env.add_source(fkc).map(lambda x: x.get("values"))
src.add_sink(
    StreamingFileSink.for_row_format('C:\\tmp\\pyoutput',
                                     SimpleStringEncoder()).build())
s_env.execute("123")
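The snippet imports MapFunction but maps with a lambda; an equivalent class-based version of that step, with a hypothetical ExtractValues name chosen only for illustration, would be:

# Illustrative alternative to the lambda above, using the imported MapFunction.
class ExtractValues(MapFunction):
    def map(self, value):
        # same logic as the lambda: pull the "values" entry out of each record
        return value.get("values")


# src = s_env.add_source(fkc).map(ExtractValues())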
Example #6
def ds_operators():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_python_executable(
        r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(),
                             Types.STRING(),
                             Types.STRING()]))
    """
    map
    flat_map
    filter
    key_by DataStream → KeyedStream
    reduce KeyedStream → DataStream
    union DataStream* → DataStream
    connect DataStream,DataStream → ConnectedStreams
    转换元组:
    project
    分区:
    partition_custom 自定义分区
    shuffle 随机分区 根据均匀分布随机划分元素。
    rebalance 轮询分区
    rescale 重新分区
    broadcast 向每个分区广播元素
    随意定制
    process 只有在KeyedStream上应用ProcessFunction时,才可以访问键控状态和计时器TimerService(相当于java的windows)。
    其它
    start_new_chain
    disable_chaining
    slot_sharing_group
    """
    ds.rescale()
    ds.map(lambda a: a)          # identity map, shown only to illustrate the call
    ds.flat_map(lambda a: [a])   # identity flat_map
    ds.filter(lambda a: True)    # keep-everything filter
    # KeyBy DataStream → KeyedStream
    # Reduce KeyedStream → DataStream
    ds = s_env.from_collection([(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
                               type_info=Types.ROW(
                                   [Types.INT(), Types.STRING()]))
    ds.key_by(lambda a: a[1]) \
        .reduce(lambda a, b: Row(a[0] + b[0], b[1]))
    # broadcast
    ds.broadcast()
    # project works only on tuple data streams
    ds = s_env.from_collection([[1, 2, 3, 4], [5, 6, 7, 8]],
                               type_info=Types.TUPLE([
                                   Types.INT(),
                                   Types.INT(),
                                   Types.INT(),
                                   Types.INT()
                               ]))
    # emit indexes 1 and 3 of each tuple
    ds.project(1, 3).map(lambda x: (x[0], x[1] + 1))

    # write the results to files
    ds.add_sink(
        StreamingFileSink.for_row_format('/tmp/output', SimpleStringEncoder())
        .with_rolling_policy(
            DefaultRollingPolicy.builder()
            .with_rollover_interval(15 * 60 * 1000)
            .with_inactivity_interval(5 * 60 * 1000)
            .with_max_part_size(1024 * 1024 * 1024)
            .build())
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix")
            .build())
        .build())
    s_env.execute('ds_operators')
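The operator list in Example #6 mentions union but the code never exercises it; a minimal, illustrative sketch (reusing the collections from Example #4 and an assumed /tmp/union_output path) might be:

# Illustrative union sketch, not part of the original examples: merges two
# streams of the same row type into one DataStream and writes it to a file sink.
from pyflink.common.serialization import SimpleStringEncoder
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import StreamingFileSink


def union_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    ds1 = env.from_collection([(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
                              type_info=Types.ROW([Types.INT(),
                                                   Types.STRING(),
                                                   Types.STRING()]))
    ds2 = env.from_collection([(3, 'Hi2', 'Hello2'), (4, 'Hello2', 'Hi2')],
                              type_info=Types.ROW([Types.INT(),
                                                   Types.STRING(),
                                                   Types.STRING()]))
    # union: DataStream* → DataStream; both inputs must share the same type
    united = ds1.union(ds2).map(lambda r: r[1], Types.STRING())
    united.add_sink(
        StreamingFileSink.for_row_format('/tmp/union_output',
                                         SimpleStringEncoder()).build())
    env.execute('union_demo')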