Example #1
    def test_stream_file_sink(self):
        self.env.set_parallelism(2)
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))
        ds.map(lambda a: a[0], Types.STRING()).add_sink(
            StreamingFileSink.for_row_format(
                self.tempdir,
                Encoder.simple_string_encoder()).with_rolling_policy(
                    RollingPolicy.default_rolling_policy(
                        part_size=1024 * 1024 * 1024,
                        rollover_interval=15 * 60 * 1000,
                        inactivity_interval=5 * 60 * 1000)).
            with_output_file_config(
                OutputFileConfig.OutputFileConfigBuilder().with_part_prefix(
                    "prefix").with_part_suffix("suffix").build()).build())

        self.env.execute("test_streaming_file_sink")

        results = []
        import os
        for root, dirs, files in os.walk(self.tempdir, topdown=True):
            for file in files:
                self.assertTrue(file.startswith('.prefix'))
                self.assertTrue('suffix' in file)
                path = root + "/" + file
                with open(path) as infile:
                    for line in infile:
                        results.append(line)

        expected = ['deeefg\n', 'bdc\n', 'ab\n', 'cfgs\n']
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)
Example #2
    def test_jdbc_sink(self):
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                      type_info=Types.ROW([Types.STRING(), Types.INT()]))
        jdbc_connection_options = JdbcConnectionOptions.JdbcConnectionOptionsBuilder()\
            .with_driver_name('com.mysql.jdbc.Driver')\
            .with_user_name('root')\
            .with_password('password')\
            .with_url('jdbc:mysql://server-name:server-port/database-name').build()

        jdbc_execution_options = JdbcExecutionOptions.builder().with_batch_interval_ms(2000)\
            .with_batch_size(100).with_max_retries(5).build()
        jdbc_sink = JdbcSink.sink("insert into test table", ds.get_type(), jdbc_connection_options,
                                  jdbc_execution_options)

        ds.add_sink(jdbc_sink).name('jdbc sink')
        plan = eval(self.env.get_execution_plan())
        self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])
        j_output_format = get_field_value(jdbc_sink.get_java_function(), 'outputFormat')

        connection_options = JdbcConnectionOptions(
            get_field_value(get_field_value(j_output_format, 'connectionProvider'),
                            'jdbcOptions'))
        self.assertEqual(jdbc_connection_options.get_db_url(), connection_options.get_db_url())
        self.assertEqual(jdbc_connection_options.get_driver_name(),
                         connection_options.get_driver_name())
        self.assertEqual(jdbc_connection_options.get_password(), connection_options.get_password())
        self.assertEqual(jdbc_connection_options.get_user_name(),
                         connection_options.get_user_name())

        exec_options = JdbcExecutionOptions(get_field_value(j_output_format, 'executionOptions'))
        self.assertEqual(jdbc_execution_options.get_batch_interval_ms(),
                         exec_options.get_batch_interval_ms())
        self.assertEqual(jdbc_execution_options.get_batch_size(),
                         exec_options.get_batch_size())
        self.assertEqual(jdbc_execution_options.get_max_retries(),
                         exec_options.get_max_retries())
Example #3
    def test_from_java_type(self):
        basic_int_type_info = Types.INT()
        self.assertEqual(basic_int_type_info,
                         _from_java_type(basic_int_type_info.get_java_type_info()))

        basic_short_type_info = Types.SHORT()
        self.assertEqual(basic_short_type_info,
                         _from_java_type(basic_short_type_info.get_java_type_info()))

        basic_long_type_info = Types.LONG()
        self.assertEqual(basic_long_type_info,
                         _from_java_type(basic_long_type_info.get_java_type_info()))

        basic_float_type_info = Types.FLOAT()
        self.assertEqual(basic_float_type_info,
                         _from_java_type(basic_float_type_info.get_java_type_info()))

        basic_double_type_info = Types.DOUBLE()
        self.assertEqual(basic_double_type_info,
                         _from_java_type(basic_double_type_info.get_java_type_info()))

        basic_char_type_info = Types.CHAR()
        self.assertEqual(basic_char_type_info,
                         _from_java_type(basic_char_type_info.get_java_type_info()))

        basic_byte_type_info = Types.BYTE()
        self.assertEqual(basic_byte_type_info,
                         _from_java_type(basic_byte_type_info.get_java_type_info()))

        basic_big_int_type_info = Types.BIG_INT()
        self.assertEqual(basic_big_int_type_info,
                         _from_java_type(basic_big_int_type_info.get_java_type_info()))

        basic_big_dec_type_info = Types.BIG_DEC()
        self.assertEqual(basic_big_dec_type_info,
                         _from_java_type(basic_big_dec_type_info.get_java_type_info()))

        basic_sql_date_type_info = Types.SQL_DATE()
        self.assertEqual(basic_sql_date_type_info,
                         _from_java_type(basic_sql_date_type_info.get_java_type_info()))

        basic_sql_time_type_info = Types.SQL_TIME()
        self.assertEqual(basic_sql_time_type_info,
                         _from_java_type(basic_sql_time_type_info.get_java_type_info()))

        basic_sql_timestamp_type_info = Types.SQL_TIMESTAMP()
        self.assertEqual(basic_sql_timestamp_type_info,
                         _from_java_type(basic_sql_timestamp_type_info.get_java_type_info()))

        row_type_info = Types.ROW([Types.INT(), Types.STRING()])
        self.assertEqual(row_type_info, _from_java_type(row_type_info.get_java_type_info()))

        tuple_type_info = Types.TUPLE([Types.CHAR(), Types.INT()])
        self.assertEqual(tuple_type_info, _from_java_type(tuple_type_info.get_java_type_info()))

        primitive_int_array_type_info = Types.PRIMITIVE_ARRAY(Types.INT())
        self.assertEqual(primitive_int_array_type_info,
                         _from_java_type(primitive_int_array_type_info.get_java_type_info()))

        object_array_type_info = Types.OBJECT_ARRAY(Types.SQL_DATE())
        self.assertEqual(object_array_type_info,
                         _from_java_type(object_array_type_info.get_java_type_info()))

        pickled_byte_array_type_info = Types.PICKLED_BYTE_ARRAY()
        self.assertEqual(pickled_byte_array_type_info,
                         _from_java_type(pickled_byte_array_type_info.get_java_type_info()))

        sql_date_type_info = Types.SQL_DATE()
        self.assertEqual(sql_date_type_info,
                         _from_java_type(sql_date_type_info.get_java_type_info()))

        map_type_info = Types.MAP(Types.INT(), Types.STRING())
        self.assertEqual(map_type_info,
                         _from_java_type(map_type_info.get_java_type_info()))

        list_type_info = Types.LIST(Types.INT())
        self.assertEqual(list_type_info,
                         _from_java_type(list_type_info.get_java_type_info()))
Example #4
    def test_generate_stream_graph_with_dependencies(self):
        python_file_dir = os.path.join(self.tempdir,
                                       "python_file_dir_" + str(uuid.uuid4()))
        os.mkdir(python_file_dir)
        python_file_path = os.path.join(
            python_file_dir, "test_stream_dependency_manage_lib.py")
        with open(python_file_path, 'w') as f:
            f.write("def add_two(a):\n    return a + 2")
        env = self.env
        env.add_python_file(python_file_path)

        def plus_two_map(value):
            from test_stream_dependency_manage_lib import add_two
            return value[0], add_two(value[1])

        def add_from_file(i):
            with open("data/data.txt", 'r') as f:
                return i[0], i[1] + int(f.read())

        from_collection_source = env.from_collection(
            [('a', 0), ('b', 0), ('c', 1), ('d', 1), ('e', 2)],
            type_info=Types.ROW([Types.STRING(), Types.INT()]))
        from_collection_source.name("From Collection")
        keyed_stream = from_collection_source.key_by(lambda x: x[1],
                                                     key_type=Types.INT())

        plus_two_map_stream = keyed_stream.map(plus_two_map).name(
            "Plus Two Map").set_parallelism(3)

        add_from_file_map = plus_two_map_stream.map(add_from_file).name(
            "Add From File Map")

        test_stream_sink = add_from_file_map.add_sink(
            self.test_sink).name("Test Sink")
        test_stream_sink.set_parallelism(4)

        archive_dir_path = os.path.join(self.tempdir,
                                        "archive_" + str(uuid.uuid4()))
        os.mkdir(archive_dir_path)
        with open(os.path.join(archive_dir_path, "data.txt"), 'w') as f:
            f.write("3")
        archive_file_path = \
            shutil.make_archive(os.path.dirname(archive_dir_path), 'zip', archive_dir_path)
        env.add_python_archive(archive_file_path, "data")

        nodes = eval(env.get_execution_plan())['nodes']

        # The StreamGraph should be as below:
        # Source: From Collection -> _stream_key_by_map_operator ->
        # Plus Two Map -> Add From File Map -> Sink: Test Sink.

        # Source: From Collection and _stream_key_by_map_operator should have the same parallelism.
        self.assertEqual(nodes[0]['parallelism'], nodes[1]['parallelism'])

        # The parallelism of Plus Two Map should be 3
        self.assertEqual(nodes[2]['parallelism'], 3)

        # The ship_strategy for Source: From Collection and _stream_key_by_map_operator should be
        # FORWARD
        self.assertEqual(nodes[1]['predecessors'][0]['ship_strategy'],
                         "FORWARD")

        # The ship_strategy between _stream_key_by_map_operator and Plus Two Map should be
        # HASH
        self.assertEqual(nodes[2]['predecessors'][0]['ship_strategy'], "HASH")

        # The parallelism of Sink: Test Sink should be 4
        self.assertEqual(nodes[4]['parallelism'], 4)

        python_dependency_config = dict(
            get_gateway().jvm.org.apache.flink.python.util.
            PythonDependencyUtils.configurePythonDependencies(
                env._j_stream_execution_environment.getCachedFiles(),
                env._j_stream_execution_environment.getConfiguration()).toMap(
                ))

        # Make sure that user specified files and archives are correctly added.
        self.assertIsNotNone(
            python_dependency_config['python.internal.files-key-map'])
        self.assertIsNotNone(
            python_dependency_config['python.internal.archives-key-map'])
Example #5
    def test_from_collection_with_data_types(self):
        # Verify from_collection for a collection with a single object.
        ds = self.env.from_collection(['Hi', 'Hello'],
                                      type_info=Types.STRING())
        ds.add_sink(self.test_sink)
        self.env.execute("test from collection with single object")
        results = self.test_sink.get_results(False)
        expected = ['Hello', 'Hi']
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)

        # Verify from_collection for a collection with composite objects such as tuples.
        ds = self.env.from_collection(
            [(1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
              bytearray(b'flink'), 'pyflink', datetime.date(2014, 9, 13),
              datetime.time(hour=12, minute=0, second=0, microsecond=123000),
              datetime.datetime(2018, 3, 11, 3, 0, 0, 123000), [1, 2, 3],
              decimal.Decimal('1000000000000000000.05'),
              decimal.Decimal('1000000000000000000.0599999999999'
                              '9999899999999999')),
             (2, None, 2, True, 43878, 9147483648, 9.87, 2.98936,
              bytearray(b'flink'), 'pyflink', datetime.date(2015, 10, 14),
              datetime.time(hour=11, minute=2, second=2, microsecond=234500),
              datetime.datetime(2020, 4, 15, 8, 2, 6, 235000), [2, 4, 6],
              decimal.Decimal('2000000000000000000.74'),
              decimal.Decimal('2000000000000000000.061111111111111'
                              '11111111111111'))],
            type_info=Types.ROW([
                Types.LONG(),
                Types.LONG(),
                Types.SHORT(),
                Types.BOOLEAN(),
                Types.SHORT(),
                Types.INT(),
                Types.FLOAT(),
                Types.DOUBLE(),
                Types.PICKLED_BYTE_ARRAY(),
                Types.STRING(),
                Types.SQL_DATE(),
                Types.SQL_TIME(),
                Types.SQL_TIMESTAMP(),
                Types.BASIC_ARRAY(Types.LONG()),
                Types.BIG_DEC(),
                Types.BIG_DEC()
            ]))
        ds.add_sink(self.test_sink)
        self.env.execute("test from collection with tuple object")
        results = self.test_sink.get_results(False)
        # If the user specifies the data types of the input data, the collected results should
        # be in Row format.
        expected = [
            '+I[1, null, 1, true, 32767, -2147483648, 1.23, 1.98932, [102, 108, 105, 110, 107], '
            'pyflink, 2014-09-13, 12:00:00, 2018-03-11 03:00:00.123, [1, 2, 3], '
            '1000000000000000000.05, 1000000000000000000.05999999999999999899999999999]',
            '+I[2, null, 2, true, -21658, 557549056, 9.87, 2.98936, [102, 108, 105, 110, 107], '
            'pyflink, 2015-10-14, 11:02:02, 2020-04-15 08:02:06.235, [2, 4, 6], '
            '2000000000000000000.74, 2000000000000000000.06111111111111111111111111111]'
        ]
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)
Example #6
    def partition_custom(
            self, partitioner: Union[Callable, Partitioner],
            key_selector: Union[Callable, KeySelector]) -> 'DataStream':
        """
        Partitions a DataStream on the key returned by the selector, using a custom partitioner.
        This method takes the key selector to get the key to partition on, and a partitioner that
        accepts the key type.

        Note that this method works only on single field keys, i.e. the selector cannot return
        tuples of fields.

        :param partitioner: The partitioner to assign partitions to keys.
        :param key_selector: The KeySelector with which the DataStream is partitioned.
        :return: The partitioned DataStream.
        """
        if callable(key_selector):
            key_selector = KeySelectorFunctionWrapper(key_selector)
        if not isinstance(key_selector,
                          (KeySelector, KeySelectorFunctionWrapper)):
            raise TypeError(
                "Parameter key_selector should be a type of KeySelector.")

        if callable(partitioner):
            partitioner = PartitionerFunctionWrapper(partitioner)
        if not isinstance(partitioner,
                          (Partitioner, PartitionerFunctionWrapper)):
            raise TypeError(
                "Parameter partitioner should be a type of Partitioner.")

        gateway = get_gateway()
        data_stream_num_partitions_env_key = gateway.jvm\
            .org.apache.flink.datastream.runtime.operators.python\
            .DataStreamPythonPartitionCustomFunctionOperator.DATA_STREAM_NUM_PARTITIONS

        class PartitionCustomMapFunction(MapFunction):
            """
            A wrapper class for the partition_custom map function. It marks the operation as a
            custom partitioning so that DataStreamPythonPartitionCustomFunctionOperator is applied
            to run the map function.
            """
            def __init__(self):
                self.num_partitions = None

            def map(self, value):
                return self.partition_custom_map(value)

            def partition_custom_map(self, value):
                if self.num_partitions is None:
                    self.num_partitions = int(
                        os.environ[data_stream_num_partitions_env_key])
                partition = partitioner.partition(key_selector.get_key(value),
                                                  self.num_partitions)
                return partition, value

            def __repr__(self) -> str:
                return '_Flink_PartitionCustomMapFunction'

        original_type_info = self.get_type()
        intermediate_map_stream = self.map(
            PartitionCustomMapFunction(),
            type_info=Types.ROW([Types.INT(), original_type_info]))
        intermediate_map_stream.name(
            gateway.jvm.org.apache.flink.python.util.PythonConfigUtil.
            STREAM_PARTITION_CUSTOM_MAP_OPERATOR_NAME)

        JPartitionCustomKeySelector = gateway.jvm\
            .org.apache.flink.datastream.runtime.functions.python.PartitionCustomKeySelector
        JIdPartitioner = gateway.jvm\
            .org.apache.flink.api.java.functions.IdPartitioner
        intermediate_map_stream = DataStream(
            intermediate_map_stream._j_data_stream.partitionCustom(
                JIdPartitioner(), JPartitionCustomKeySelector()))

        values_map_stream = intermediate_map_stream.map(
            lambda x: x[1], original_type_info)
        values_map_stream.name(
            gateway.jvm.org.apache.flink.python.util.PythonConfigUtil.
            KEYED_STREAM_VALUE_OPERATOR_NAME)
        return DataStream(values_map_stream._j_data_stream)
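# Usage sketch (an illustration, not part of the method above): partition a stream of
# (str, int) rows by its integer field. The partitioner callable receives (key, num_partitions)
# and returns the target partition index; the key selector must return a single field.
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3)],
                         type_info=Types.ROW([Types.STRING(), Types.INT()]))
partitioned = ds.partition_custom(lambda key, num_partitions: key % num_partitions,
                                  lambda value: value[1])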
Example #7
    def kafka_connector_assertion(self, flink_kafka_consumer_clz,
                                  flink_kafka_producer_clz):
        source_topic = 'test_source_topic'
        sink_topic = 'test_sink_topic'
        props = {
            'bootstrap.servers': 'localhost:9092',
            'group.id': 'test_group'
        }
        type_info = Types.ROW([Types.INT(), Types.STRING()])

        # Test for kafka consumer
        deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(type_info=type_info).build()

        flink_kafka_consumer = flink_kafka_consumer_clz(
            source_topic, deserialization_schema, props)
        flink_kafka_consumer.set_start_from_earliest()
        flink_kafka_consumer.set_commit_offsets_on_checkpoints(True)

        j_properties = get_field_value(
            flink_kafka_consumer.get_java_function(), 'properties')
        self.assertEqual('localhost:9092',
                         j_properties.getProperty('bootstrap.servers'))
        self.assertEqual('test_group', j_properties.getProperty('group.id'))
        self.assertTrue(
            get_field_value(flink_kafka_consumer.get_java_function(),
                            'enableCommitOnCheckpoints'))
        j_start_up_mode = get_field_value(
            flink_kafka_consumer.get_java_function(), 'startupMode')

        j_deserializer = get_field_value(
            flink_kafka_consumer.get_java_function(), 'deserializer')
        j_deserialize_type_info = invoke_java_object_method(
            j_deserializer, "getProducedType")
        deserialize_type_info = typeinfo._from_java_type(
            j_deserialize_type_info)
        self.assertTrue(deserialize_type_info == type_info)
        self.assertTrue(
            j_start_up_mode.equals(
                get_gateway().jvm.org.apache.flink.streaming.connectors.kafka.
                config.StartupMode.EARLIEST))
        j_topic_desc = get_field_value(
            flink_kafka_consumer.get_java_function(), 'topicsDescriptor')
        j_topics = invoke_java_object_method(j_topic_desc, 'getFixedTopics')
        self.assertEqual(['test_source_topic'], list(j_topics))

        # Test for kafka producer
        serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
            .build()
        flink_kafka_producer = flink_kafka_producer_clz(
            sink_topic, serialization_schema, props)
        flink_kafka_producer.set_write_timestamp_to_kafka(False)

        j_producer_config = get_field_value(
            flink_kafka_producer.get_java_function(), 'producerConfig')
        self.assertEqual('localhost:9092',
                         j_producer_config.getProperty('bootstrap.servers'))
        self.assertEqual('test_group',
                         j_producer_config.getProperty('group.id'))
        self.assertFalse(
            get_field_value(flink_kafka_producer.get_java_function(),
                            'writeTimestampToKafka'))
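
    # Usage sketch (an assumption, not shown in the snippet above): the helper is presumably
    # invoked from concrete test cases with the connector classes under test, e.g. the
    # FlinkKafkaConsumer / FlinkKafkaProducer classes from pyflink.datastream.connectors.
    def test_kafka_connector_universal(self):
        self.kafka_connector_assertion(FlinkKafkaConsumer, FlinkKafkaProducer)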
Example #8
    def test_pulsar_sink(self):
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))

        TEST_OPTION_NAME = 'pulsar.producer.chunkingEnabled'
        pulsar_sink = PulsarSink.builder() \
            .set_service_url('pulsar://localhost:6650') \
            .set_admin_url('http://localhost:8080') \
            .set_producer_name('fo') \
            .set_topics('ada') \
            .set_serialization_schema(
                PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
            .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
            .set_topic_routing_mode(TopicRoutingMode.ROUND_ROBIN) \
            .delay_sending_message(MessageDelayer.fixed(Duration.of_seconds(12))) \
            .set_config(TEST_OPTION_NAME, True) \
            .set_properties({'pulsar.producer.batchingMaxMessages': '100'}) \
            .build()

        ds.sink_to(pulsar_sink).name('pulsar sink')

        plan = eval(self.env.get_execution_plan())
        self.assertEqual('pulsar sink: Writer', plan['nodes'][1]['type'])
        configuration = get_field_value(pulsar_sink.get_java_function(),
                                        "sinkConfiguration")
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.client.serviceUrl').string_type().
                no_default_value()._j_config_option),
            'pulsar://localhost:6650')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.admin.adminUrl').string_type().
                no_default_value()._j_config_option), 'http://localhost:8080')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.producer.producerName').string_type(
                ).no_default_value()._j_config_option), 'fo - %s')

        j_pulsar_serialization_schema = get_field_value(
            pulsar_sink.get_java_function(), 'serializationSchema')
        j_serialization_schema = get_field_value(j_pulsar_serialization_schema,
                                                 'serializationSchema')
        self.assertTrue(
            is_instance_of(
                j_serialization_schema,
                'org.apache.flink.api.common.serialization.SimpleStringSchema')
        )

        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.sink.deliveryGuarantee').string_type(
                ).no_default_value()._j_config_option), 'at-least-once')

        j_topic_router = get_field_value(pulsar_sink.get_java_function(),
                                         "topicRouter")
        self.assertTrue(
            is_instance_of(
                j_topic_router,
                'org.apache.flink.connector.pulsar.sink.writer.router.RoundRobinTopicRouter'
            ))

        j_message_delayer = get_field_value(pulsar_sink.get_java_function(),
                                            'messageDelayer')
        delay_duration = get_field_value(j_message_delayer, 'delayDuration')
        self.assertEqual(delay_duration, 12000)

        test_option = ConfigOptions.key(
            TEST_OPTION_NAME).boolean_type().no_default_value()
        self.assertEqual(
            configuration.getBoolean(test_option._j_config_option), True)
        self.assertEqual(
            configuration.getLong(
                ConfigOptions.key('pulsar.producer.batchingMaxMessages').
                long_type().no_default_value()._j_config_option), 100)
Example #9
def demo01():
    # Create an execution environment that represents the context in which the program is
    # currently executed. If the program is invoked standalone, this method returns a local
    # execution environment.
    # 1: Create a stream execution environment: a local environment when started locally,
    #    a cluster environment when started on a cluster.
    env = StreamExecutionEnvironment.get_execution_environment()

    # Add a list of URLs that will be appended to the classpath of every user-code class loader
    # of the program. The paths must specify a protocol (e.g. file://) and be accessible on all
    # nodes.
    env.add_classpaths("file://lib")

    # Add a list of jar files that will be uploaded to the cluster and referenced by the job.
    # Equivalent to .set_string("pipeline.jars", 'file://' + dir_kafka_sql_connect)
    env.add_jars("file://jars")

    # Add a Python archive file. The file will be extracted into the working directory of the
    # Python UDF worker.
    # Currently only zip-format archives are supported, i.e. zip, jar, whl, egg, etc.
    # Build the archive beforehand, e.g.: zip -r py_env.zip py_env
    env.add_python_archive("py_env.zip")
    # If the Python UDFs depend on a specific Python version that does not exist on the cluster,
    # this method can be used to upload a virtual environment. Note that the path of the Python
    # interpreter contained in the uploaded environment should be specified via this method.
    env.set_python_executable("py_env.zip/py_env/bin/python")
    # Alternatively, add python.client.executable: /usr/bin/python3 to conf/flink-conf.yaml
    # or
    env.add_python_archive("py_env.zip", "myenv")
    env.set_python_executable("myenv/py_env/bin/python")
    # the files contained in the archive file can be accessed in UDF
    """
    def my_udf():
        with open("myenv/py_env/data/data.txt") as f:
            ...
    """
    # Equivalent to: pip download -d cached_dir -r requirements.txt --no-binary :all:
    env.set_python_requirements("requirements.txt", "cached_dir")
    # Add a Python dependency, which can be a Python file, a Python package or a local directory.
    # It will be added to the PYTHONPATH of the Python UDF worker. Make sure the dependency can
    # be imported.
    env.add_python_file("")

    # Add a source
    # 1. add_source
    ds = env.add_source(
        FlinkKafkaConsumer(
            "source_topic",
            JsonRowDeserializationSchema.builder().type_info(
                type_info=Types.ROW([Types.INT(), Types.STRING()])).build(), {
                    'bootstrap.servers': 'localhost:9092',
                    'group.id': 'test_group'
                }))
    # 2. from_collection
    ds = env.from_collection([
        1,
        2,
        3,
    ], Types.INT())
    # 3. From a file
    ds = env.read_text_file("hdfs://host:port/file/path")

    # Disable operator chaining
    env.disable_operator_chaining()
    """
    Flink can perform stateful stream computation very efficiently by using the built-in
    Keyed State and Operator State to save the state of each operator.
    By default the state is kept in the JVM heap. If any part of the system fails or crashes,
    all state is lost and cannot be recovered, corrupting the result of the whole computation.
    Checkpoints provide the fault tolerance here: the checkpoint process periodically persists
    the operator state. When the system recovers from a failure, each operator's state is
    restored from the checkpoint and consumption/computation resumes from the last consumed
    position, so the computation stays efficient while no data is lost and each record is
    counted only once.

    At least once
    AT_LEAST_ONCE
    Suppose the problem occurs during transport and the server never receives the data; after a
    timeout the data is retransmitted. But the failure may instead happen while the success
    message is being returned, although the server has already received the data, in which case
    the retransmission produces duplicates. That is at-least-once.

    Exactly once
    EXACTLY_ONCE

    At-most-once, at-least-once and exactly-once

    Two prerequisites for checkpoints
    1. A data source that can replay data within a certain time range, e.g. Kafka.
    The fault-tolerance mechanism automatically restores the job from the latest successful
    checkpoint after a failure, so the data consumed before the failure must be consumed again.
    If the source did not support replay, data that had not yet been written to storage would be
    lost and could never be re-consumed after recovery.

    2. Storage to persist the state, e.g. HDFS or local files, so that checkpoint data can be
    restored after a failure.

    https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/stream/state/checkpointing.html
    https://ci.apache.org/projects/flink/flink-docs-release-1.12/api/python/pyflink.datastream.html#pyflink.datastream.CheckpointConfig
    """
    # Take a checkpoint every 300 s
    env.enable_checkpointing(300000, CheckpointingMode.AT_LEAST_ONCE)
    # MemoryStateBackend FsStateBackend CustomStateBackend
    env.set_state_backend(RocksDBStateBackend("file://var/checkpoints/"))

    # set mode to exactly-once (this is the default)
    env.get_checkpoint_config().set_checkpointing_mode(
        CheckpointingMode.EXACTLY_ONCE)

    # At least 500 ms must pass between two checkpoints (default 0, i.e. the next checkpoint may
    # start immediately): make sure 500 ms of progress happen between checkpoints
    env.get_checkpoint_config().set_min_pause_between_checkpoints(500)

    # Checkpoints have to complete within 60 s, or they are discarded
    env.get_checkpoint_config().set_checkpoint_timeout(60000)

    # Allow only one checkpoint to be in progress at the same time
    env.get_checkpoint_config().set_max_concurrent_checkpoints(1)

    # Enable externalized checkpoints which are retained after job cancellation
    env.get_checkpoint_config().enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

    # Allow job recovery to fall back to a checkpoint even when there is a more recent savepoint
    env.get_checkpoint_config().set_prefer_checkpoint_for_recovery(True)

    # Enable the experimental unaligned checkpoints to improve performance
    # (only available with CheckpointingMode.EXACTLY_ONCE)
    env.get_checkpoint_config().enable_unaligned_checkpoints()
    # env.get_checkpoint_config().disable_unaligned_checkpoints() is equivalent to
    # env.get_checkpoint_config().enable_unaligned_checkpoints(False)

    env.get_checkpoint_interval()  # equivalent to env.get_checkpoint_config().get_checkpoint_interval()
    """
    """
    # https://ci.apache.org/projects/flink/flink-docs-release-1.12/api/python/pyflink.common.html#pyflink.common.ExecutionConfig
    # bin/flink run -Dexecution.runtime-mode=BATCH examples/streaming/WordCount.jar
    env.get_config().set_execution_mode(ExecutionMode.BATCH)

    env.get_config().disable_auto_generated_uids(
    )  # enable_auto_generated_uids
    # Set the uid explicitly
    ds.uid("xx")

    # Set the time characteristic for all streams created from this environment, e.g. processing
    # time, event time or ingestion time.
    # If the characteristic is set to EventTime or IngestionTime, the default auto watermark
    # interval is set to 200 ms.
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)  # use event time
    env.get_config().set_auto_watermark_interval(200)  # emit a watermark every 200 ms

    env.get_config().set_global_job_parameters(
        {"environment.checkpoint_interval": "1000"})

    env.get_config().set_restart_strategy(
        RestartStrategies.fixed_delay_restart(10, 1000))

    # Execute
    env.execute("job name")
    # Execute asynchronously
    jobClient = env.execute_async("job name")
    jobClient.get_job_execution_result().result()
    """
    Set the maximum time frequency (milliseconds) for flushing of the output buffers. By default
    the output buffers flush frequently to provide low latency and to aid a smooth developer
    experience. Setting this parameter yields three logical modes:
    A positive integer triggers periodic flushing with that interval
    0 triggers flushing after every record, minimizing latency (better not to use 0; use a value
    close to 0 instead, such as 5 or 10)
    -1 triggers flushing only when the output buffer is full, maximizing throughput
    """
    # Maximum time frequency (milliseconds) for flushing of the output buffers
    env.get_buffer_timeout()
    env.set_buffer_timeout(10)

    # Get the execution plan as JSON; paste it into https://flink.apache.org/visualizer/
    env.get_execution_plan()
Example #10
def ds_operators():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_python_executable(
        r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(),
                             Types.STRING(),
                             Types.STRING()]))
    """
    map
    flat_map
    filter
    key_by DataStream → KeyedStream
    reduce KeyedStream → DataStream
    union DataStream* → DataStream
    connect DataStream,DataStream → ConnectedStreams
    Tuple transformation:
    project
    Partitioning:
    partition_custom  custom partitioning
    shuffle  random partitioning, distributes elements uniformly at random
    rebalance  round-robin partitioning
    rescale  repartitioning
    broadcast  broadcasts elements to every partition
    Arbitrary customization:
    process  keyed state and timers (TimerService, roughly the counterpart of windows in the
             Java API) are only accessible when a ProcessFunction is applied on a KeyedStream.
    Others:
    start_new_chain
    disable_chaining
    slot_sharing_group
    """
    ds.rescale()
    ds.map(lambda x: x)          # placeholder: identity map
    ds.flat_map(lambda x: [x])   # placeholder: trivial flat_map
    ds.filter(lambda x: True)    # placeholder: keep everything
    # KeyBy DataStream → KeyedStream
    # Reduce KeyedStream → DataStream
    ds = s_env.from_collection([(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
                               type_info=Types.ROW(
                                   [Types.INT(), Types.STRING()]))
    ds.key_by(lambda a: a[1]) \
        .reduce(lambda a, b: Row(a[0] + b[0], b[1]))
    # Broadcast
    ds.broadcast()
    # project works only on tuple data streams
    ds = s_env.from_collection([[1, 2, 3, 4], [5, 6, 7, 8]],
                               type_info=Types.TUPLE([
                                   Types.INT(),
                                   Types.INT(),
                                   Types.INT(),
                                   Types.INT()
                               ]))
    # Output indexes 1 and 3 of the tuple
    ds.project(1, 3).map(lambda x: (x[0], x[1] + 1))  # chain .add_sink(<sink>) here to emit the result

    # Sink: write to files
    ds.add_sink(
        StreamingFileSink.for_row_format(
            '/tmp/output', SimpleStringEncoder()).with_rolling_policy(
                DefaultRollingPolicy.builder().with_rollover_interval(
                    15 * 60 * 1000).with_inactivity_interval(
                        5 * 60 * 1000).with_max_part_size(1024 * 1024 *
                                                          1024).build()).
        with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder().with_part_prefix(
                "prefix").with_part_suffix("suffix").build()).build())
    s_env.execute('ds_operators')
Example #11
    def test_keyed_process_function_with_state(self):
        self.env.set_parallelism(1)
        self.env.get_config().set_auto_watermark_interval(2000)
        self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
        data_stream = self.env.from_collection(
            [(1, 'hi', '1603708211000'), (2, 'hello', '1603708224000'),
             (3, 'hi', '1603708226000'), (4, 'hello', '1603708289000'),
             (5, 'hi', '1603708291000'), (6, 'hello', '1603708293000')],
            type_info=Types.ROW([Types.INT(),
                                 Types.STRING(),
                                 Types.STRING()]))

        class MyTimestampAssigner(TimestampAssigner):
            def extract_timestamp(self, value, record_timestamp) -> int:
                return int(value[2])

        class MyProcessFunction(KeyedProcessFunction):
            def __init__(self):
                self.value_state = None
                self.list_state = None
                self.map_state = None

            def open(self, runtime_context: RuntimeContext):
                value_state_descriptor = ValueStateDescriptor(
                    'value_state', Types.INT())
                self.value_state = runtime_context.get_state(
                    value_state_descriptor)
                list_state_descriptor = ListStateDescriptor(
                    'list_state', Types.INT())
                self.list_state = runtime_context.get_list_state(
                    list_state_descriptor)
                map_state_descriptor = MapStateDescriptor(
                    'map_state', Types.INT(), Types.STRING())
                self.map_state = runtime_context.get_map_state(
                    map_state_descriptor)

            def process_element(self, value, ctx):
                current_value = self.value_state.value()
                self.value_state.update(value[0])
                current_list = [_ for _ in self.list_state.get()]
                self.list_state.add(value[0])
                map_entries_string = []
                for k, v in self.map_state.items():
                    map_entries_string.append(str(k) + ': ' + str(v))
                map_entries_string = '{' + ', '.join(map_entries_string) + '}'
                self.map_state.put(value[0], value[1])
                current_key = ctx.get_current_key()
                yield "current key: {}, current value state: {}, current list state: {}, " \
                      "current map state: {}, current value: {}".format(str(current_key),
                                                                        str(current_value),
                                                                        str(current_list),
                                                                        map_entries_string,
                                                                        str(value))

            def on_timer(self, timestamp, ctx):
                pass

        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(MyTimestampAssigner())
        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[1], key_type_info=Types.STRING()) \
            .process(MyProcessFunction(), output_type=Types.STRING()) \
            .add_sink(self.test_sink)
        self.env.execute(
            'test time stamp assigner with keyed process function')
        result = self.test_sink.get_results()
        expected_result = [
            "current key: hi, current value state: None, current list state: [], "
            "current map state: {}, current value: Row(f0=1, f1='hi', "
            "f2='1603708211000')",
            "current key: hello, current value state: None, "
            "current list state: [], current map state: {}, current value: Row(f0=2,"
            " f1='hello', f2='1603708224000')",
            "current key: hi, current value state: 1, current list state: [1], "
            "current map state: {1: hi}, current value: Row(f0=3, f1='hi', "
            "f2='1603708226000')",
            "current key: hello, current value state: 2, current list state: [2], "
            "current map state: {2: hello}, current value: Row(f0=4, f1='hello', "
            "f2='1603708289000')",
            "current key: hi, current value state: 3, current list state: [1, 3], "
            "current map state: {1: hi, 3: hi}, current value: Row(f0=5, f1='hi', "
            "f2='1603708291000')",
            "current key: hello, current value state: 4, current list state: [2, 4],"
            " current map state: {2: hello, 4: hello}, current value: Row(f0=6, "
            "f1='hello', f2='1603708293000')"
        ]
        result.sort()
        expected_result.sort()
        self.assertEqual(expected_result, result)