Example #1
    def test_file_sink(self):
        base_path = "/tmp/1.txt"
        encoder = Encoder.simple_string_encoder()
        file_sink_builder = FileSink.for_row_format(base_path, encoder)
        file_sink = file_sink_builder\
            .with_bucket_check_interval(1000) \
            .with_bucket_assigner(BucketAssigner.base_path_bucket_assigner()) \
            .with_rolling_policy(RollingPolicy.on_checkpoint_rolling_policy()) \
            .with_output_file_config(
                OutputFileConfig.builder().with_part_prefix("pre").with_part_suffix("suf").build())\
            .build()

        buckets_builder_field = \
            load_java_class("org.apache.flink.connector.file.sink.FileSink"). \
            getDeclaredField("bucketsBuilder")
        buckets_builder_field.setAccessible(True)
        buckets_builder = buckets_builder_field.get(
            file_sink.get_java_function())

        self.assertEqual("DefaultRowFormatBuilder",
                         buckets_builder.getClass().getSimpleName())

        row_format_builder_clz = load_java_class(
            "org.apache.flink.connector.file.sink.FileSink$RowFormatBuilder")
        encoder_field = row_format_builder_clz.getDeclaredField("encoder")
        encoder_field.setAccessible(True)
        self.assertEqual(
            "SimpleStringEncoder",
            encoder_field.get(buckets_builder).getClass().getSimpleName())

        interval_field = row_format_builder_clz.getDeclaredField(
            "bucketCheckInterval")
        interval_field.setAccessible(True)
        self.assertEqual(1000, interval_field.get(buckets_builder))

        bucket_assigner_field = row_format_builder_clz.getDeclaredField(
            "bucketAssigner")
        bucket_assigner_field.setAccessible(True)
        self.assertEqual(
            "BasePathBucketAssigner",
            bucket_assigner_field.get(
                buckets_builder).getClass().getSimpleName())

        rolling_policy_field = row_format_builder_clz.getDeclaredField(
            "rollingPolicy")
        rolling_policy_field.setAccessible(True)
        self.assertEqual(
            "OnCheckpointRollingPolicy",
            rolling_policy_field.get(
                buckets_builder).getClass().getSimpleName())

        output_file_config_field = row_format_builder_clz.getDeclaredField(
            "outputFileConfig")
        output_file_config_field.setAccessible(True)
        output_file_config = output_file_config_field.get(buckets_builder)
        self.assertEqual("pre", output_file_config.getPartPrefix())
        self.assertEqual("suf", output_file_config.getPartSuffix())
Example #2
def call(f: Union[str, UserDefinedFunctionWrapper], *args) -> Expression:
    """
    The first parameter `f` could be a str or a Python user-defined function.

    When it is a str, this is a call to a function that will be looked up in a catalog. There
    are two kinds of functions:

        - System functions - which are identified by one-part names
        - Catalog functions - which are always identified by three-part names
            (catalog, database, function)

    Moreover, each function can be either a temporary function or a permanent one
    (which is stored in an external catalog).

    Based on those two properties, the resolution order for looking up a function given the
    provided name is as follows:

        - Temporary system function
        - System function
        - Temporary catalog function
        - Catalog function

    :param f: the path of the function or the Python user-defined function.
    :param args: parameters of the user-defined function.
    """
    gateway = get_gateway()

    if isinstance(f, str):
        return Expression(
            gateway.jvm.Expressions.call(
                f,
                to_jarray(gateway.jvm.Object,
                          [_get_java_expression(arg) for arg in args])))

    expressions_clz = load_java_class("org.apache.flink.table.api.Expressions")
    function_definition_clz = load_java_class(
        'org.apache.flink.table.functions.FunctionDefinition')
    j_object_array_type = to_jarray(gateway.jvm.Object, []).getClass()

    api_call_method = expressions_clz.getDeclaredMethod(
        "apiCall",
        to_jarray(gateway.jvm.Class,
                  [function_definition_clz, j_object_array_type]))
    api_call_method.setAccessible(True)

    return Expression(
        api_call_method.invoke(
            None,
            to_jarray(gateway.jvm.Object, [
                f._java_user_defined_function(),
                to_jarray(gateway.jvm.Object,
                          [_get_java_expression(arg) for arg in args])
            ])))
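
As a usage sketch of both forms (hedged: the table `tab` and its column names are assumed for illustration):

from pyflink.table import DataTypes
from pyflink.table.expressions import call, col
from pyflink.table.udf import udf

# look up a system or catalog function by name:
tab.select(call("upper", col("name")))

# call a Python user-defined function directly:
@udf(result_type=DataTypes.BIGINT())
def add(i, j):
    return i + j

tab.select(call(add, col("a"), col("b")))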
Example #3
    def add_default_kryo_serializer(self, type_class_name: str, serializer_class_name: str):
        """
        Adds a new Kryo default serializer to the Runtime.

        Example:
        ::

            >>> env.add_default_kryo_serializer("com.aaa.bbb.TypeClass", "com.aaa.bbb.Serializer")

        :param type_class_name: The fully-qualified Java class name of the types serialized with
                                the given serializer.
        :param serializer_class_name: The fully-qualified Java class name of the serializer to use.
        """
        type_clz = load_java_class(type_class_name)
        j_serializer_clz = load_java_class(serializer_class_name)
        self._j_stream_execution_environment.addDefaultKryoSerializer(type_clz, j_serializer_clz)
Example #4
    def test_file_source(self):
        stream_format = StreamFormat.text_line_format()
        paths = ["/tmp/1.txt", "/tmp/2.txt"]
        file_source_builder = FileSource.for_record_stream_format(
            stream_format, *paths)
        file_source = file_source_builder\
            .monitor_continuously(Duration.of_days(1)) \
            .set_file_enumerator(FileEnumeratorProvider.default_splittable_file_enumerator()) \
            .set_split_assigner(FileSplitAssignerProvider.locality_aware_split_assigner()) \
            .build()

        continuous_setting = file_source.get_java_function(
        ).getContinuousEnumerationSettings()
        self.assertIsNotNone(continuous_setting)
        self.assertEqual(Duration.of_days(1),
                         Duration(continuous_setting.getDiscoveryInterval()))

        input_paths_field = \
            load_java_class("org.apache.flink.connector.file.src.AbstractFileSource"). \
            getDeclaredField("inputPaths")
        input_paths_field.setAccessible(True)
        input_paths = input_paths_field.get(file_source.get_java_function())
        self.assertEqual(len(input_paths), len(paths))
        self.assertEqual(str(input_paths[0]), paths[0])
        self.assertEqual(str(input_paths[1]), paths[1])
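
A hedged usage sketch of reading from a source built this way (the environment is assumed, not part of the test):

from pyflink.common.watermark_strategy import WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_source(
    source=file_source,  # built as in the test above
    watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
    source_name="file_source")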
Example #5
    def __init__(self,
                 record_class: str = None,
                 avro_schema_string: str = None):
        """
        Creates an Avro deserialization schema for the given specific record class or Avro schema
        string. Having the concrete Avro record class might improve performance.

        :param record_class: Avro record class used to deserialize Avro's record to Flink's row.
        :param avro_schema_string: Avro schema string to deserialize Avro's record to Flink's row.
        """

        if avro_schema_string is None and record_class is None:
            raise TypeError(
                "record_class or avro_schema_string should be specified.")
        j_deserialization_schema = None
        if record_class is not None:
            gateway = get_gateway()
            java_import(gateway.jvm, record_class)
            j_record_class = load_java_class(record_class)
            JAvroRowDeserializationSchema = get_gateway().jvm \
                .org.apache.flink.formats.avro.AvroRowDeserializationSchema
            j_deserialization_schema = JAvroRowDeserializationSchema(
                j_record_class)

        elif avro_schema_string is not None:
            JAvroRowDeserializationSchema = get_gateway().jvm \
                .org.apache.flink.formats.avro.AvroRowDeserializationSchema
            j_deserialization_schema = JAvroRowDeserializationSchema(
                avro_schema_string)

        super(AvroRowDeserializationSchema,
              self).__init__(j_deserialization_schema)
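
A minimal usage sketch with an inline Avro schema (the schema itself is made up for illustration):

avro_schema = """
{
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "age", "type": "int"}
    ]
}
"""
deserialization_schema = AvroRowDeserializationSchema(avro_schema_string=avro_schema)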
Example #6
    def __init__(self,
                 record_class: str = None,
                 avro_schema_string: str = None):
        """
        Creates an AvroSerializationSchema that serializes a SpecificRecord using the provided
        schema or record class.

        :param record_class: Avro record class used to serialize Flink's row to Avro's record.
        :param avro_schema_string: Avro schema string to serialize Flink's row to Avro's record.
        """
        if avro_schema_string is None and record_class is None:
            raise TypeError(
                "record_class or avro_schema_string should be specified.")

        j_serialization_schema = None
        if record_class is not None:
            gateway = get_gateway()
            java_import(gateway.jvm, record_class)
            j_record_class = load_java_class(record_class)
            JAvroRowSerializationSchema = get_gateway().jvm \
                .org.apache.flink.formats.avro.AvroRowSerializationSchema
            j_serialization_schema = JAvroRowSerializationSchema(
                j_record_class)

        elif avro_schema_string is not None:
            JAvroRowSerializationSchema = get_gateway().jvm \
                .org.apache.flink.formats.avro.AvroRowSerializationSchema
            j_serialization_schema = JAvroRowSerializationSchema(
                avro_schema_string)

        super(AvroRowSerializationSchema,
              self).__init__(j_serialization_schema)
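
The record-class variant is a one-liner; com.example.User below is a hypothetical SpecificRecord class that must already be on the JVM classpath:

serialization_schema = AvroRowSerializationSchema(record_class="com.example.User")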
Example #7
    def set_topic_router(self, topic_router_class_name: str) -> 'PulsarSinkBuilder':
        """
        Use a custom topic router instead of the predefined topic routing.

        :param topic_router_class_name: The fully-qualified Java class name of the topic router.
        """
        j_topic_router = load_java_class(topic_router_class_name).newInstance()
        self._j_pulsar_sink_builder.setTopicRouter(j_topic_router)
        return self
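
A hedged usage sketch; com.example.MyTopicRouter is a hypothetical TopicRouter implementation with a no-arg constructor, available on the classpath:

from pyflink.datastream.connectors.pulsar import PulsarSink

builder = PulsarSink.builder()
builder.set_topic_router("com.example.MyTopicRouter")  # hypothetical router class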
Example #8
    def with_bucket_assigner(
            self,
            assigner_class_name: str) -> 'StreamingFileSink.DefaultRowFormatBuilder':
        """
        Sets a custom bucket assigner given the fully-qualified Java class name of a
        ``BucketAssigner`` implementation.
        """
        gateway = get_gateway()
        java_import(gateway.jvm, assigner_class_name)
        j_assigner_class = load_java_class(assigner_class_name)
        self.j_default_row_format_builder.withBucketAssigner(j_assigner_class)
        return self
Example #9
    def register_type_with_kryo_serializer(self, type_class_name: str, serializer_class_name: str):
        """
        Registers the given serializer class as a serializer for the given type with the
        KryoSerializer.

        Example:
        ::

            >>> env.register_type_with_kryo_serializer("com.aaa.bbb.TypeClass",
            ...                                        "com.aaa.bbb.Serializer")

        :param type_class_name: The fully-qualified Java class name of the types serialized with
                                the given serializer.
        :param serializer_class_name: The fully-qualified Java class name of the serializer to use.
        """
        type_clz = load_java_class(type_class_name)
        j_serializer_clz = load_java_class(serializer_class_name)
        self._j_stream_execution_environment.registerTypeWithKryoSerializer(
            type_clz, j_serializer_clz)
Example #10
    def test_create_custom_state_backend(self):
        gateway = get_gateway()
        JConfiguration = gateway.jvm.org.apache.flink.configuration.Configuration
        j_config = JConfiguration()
        j_factory = load_java_class("org.apache.flink.streaming.runtime.tasks."
                                    "StreamTaskTest$TestMemoryStateBackendFactory").newInstance()
        context_classloader = gateway.jvm.Thread.currentThread().getContextClassLoader()
        state_backend = _from_j_state_backend(j_factory.createFromConfig(j_config,
                                                                         context_classloader))

        self.assertIsInstance(state_backend, CustomStateBackend)
Example #11
    def add_default_kryo_serializer(
            self, type_class_name: str,
            serializer_class_name: str) -> 'ExecutionConfig':
        """
        Adds a new Kryo default serializer to the Runtime.

        Example:
        ::

            >>> config.add_default_kryo_serializer("com.aaa.bbb.PojoClass",
            ...                                    "com.aaa.bbb.Serializer")

        :param type_class_name: The fully-qualified Java class name of the types serialized with
                                the given serializer.
        :param serializer_class_name: The fully-qualified Java class name of the serializer to use.
        """
        type_clz = load_java_class(type_class_name)
        j_serializer_clz = load_java_class(serializer_class_name)
        self._j_execution_config.addDefaultKryoSerializer(
            type_clz, j_serializer_clz)
        return self
Example #12
    def test_seq_source(self):
        seq_source = NumberSequenceSource(1, 10)

        seq_source_clz = load_java_class(
            "org.apache.flink.api.connector.source.lib.NumberSequenceSource")
        from_field = seq_source_clz.getDeclaredField("from")
        from_field.setAccessible(True)
        self.assertEqual(1, from_field.get(seq_source.get_java_function()))

        to_field = seq_source_clz.getDeclaredField("to")
        to_field.setAccessible(True)
        self.assertEqual(10, to_field.get(seq_source.get_java_function()))
Example #13
def add_jars_to_context_class_loader(jar_urls):
    """
    Add jars to the Python gateway server for local compilation and local execution (i.e.
    minicluster). There are many components in Flink which won't be added to the classpath by
    default, e.g. the Kafka connector, the JDBC connector, the CSV format, etc. This utility
    function can be used to hot-load those jars.

    :param jar_urls: The list of jar URLs.
    """
    gateway = get_gateway()
    # validate and normalize
    jar_urls = [gateway.jvm.java.net.URL(url) for url in jar_urls]
    context_classloader = gateway.jvm.Thread.currentThread().getContextClassLoader()
    existing_urls = []
    class_loader_name = context_classloader.getClass().getName()
    if class_loader_name == "java.net.URLClassLoader":
        existing_urls = set([url.toString() for url in context_classloader.getURLs()])
    if all([url.toString() in existing_urls for url in jar_urls]):
        # if urls all existed, no need to create new class loader.
        return
    URLClassLoaderClass = load_java_class("java.net.URLClassLoader")
    addURL = URLClassLoaderClass.getDeclaredMethod(
        "addURL",
        to_jarray(
            gateway.jvm.Class,
            [load_java_class("java.net.URL")]))
    addURL.setAccessible(True)
    if class_loader_name == "org.apache.flink.runtime.execution.librarycache." \
                            "FlinkUserCodeClassLoaders$SafetyNetWrapperClassLoader":
        ensureInner = context_classloader.getClass().getDeclaredMethod("ensureInner", None)
        ensureInner.setAccessible(True)
        loader = ensureInner.invoke(context_classloader, None)
    else:
        loader = context_classloader
    for url in jar_urls:
        addURL.invoke(loader, to_jarray(get_gateway().jvm.Object, [url]))
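
For example (the jar path below is a placeholder):

add_jars_to_context_class_loader(
    ["file:///path/to/flink-sql-connector-kafka.jar"])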
Example #14
    def register_type(self, type_class_name: str):
        """
        Registers the given type with the serialization stack. If the type is eventually
        serialized as a POJO, then the type is registered with the POJO serializer. If the
        type ends up being serialized with Kryo, then it will be registered at Kryo to make
        sure that only tags are written.

        Example:
        ::

            >>> env.register_type("com.aaa.bbb.TypeClass")

        :param type_class_name: The fully-qualified Java class name of the type to register.
        """
        type_clz = load_java_class(type_class_name)
        self._j_stream_execution_environment.registerType(type_clz)
Example #15
    def register_kryo_type(self, type_class_name: str) -> 'ExecutionConfig':
        """
        Registers the given type with the serialization stack. If the type is eventually
        serialized as a POJO, then the type is registered with the POJO serializer. If the
        type ends up being serialized with Kryo, then it will be registered at Kryo to make
        sure that only tags are written.

        Example:
        ::

            >>> config.register_kryo_type("com.aaa.bbb.KryoClass")

        :param type_class_name: The fully-qualified Java class name of the type to register.
        """
        type_clz = load_java_class(type_class_name)
        self._j_execution_config.registerKryoType(type_clz)
        return self
Example #16
    def set_options(self, options_factory_class_name: str):
        """
        Sets ``org.rocksdb.Options`` for the RocksDB instances.
        Because the options are not serializable and hold native code references,
        they must be specified through a factory.

        The options created by the factory here are applied on top of the pre-defined
        options profile selected via :func:`set_predefined_options`.
        If the pre-defined options profile is the default (:data:`PredefinedOptions.DEFAULT`),
        then the factory fully controls the RocksDB options.

        :param options_factory_class_name: The fully-qualified class name of the options
                                           factory in Java that lazily creates the RocksDB options.
                                           The options factory must have a default constructor.
        """
        gateway = get_gateway()
        JOptionsFactory = gateway.jvm.org.apache.flink.contrib.streaming.state.RocksDBOptionsFactory
        j_options_factory_clz = load_java_class(options_factory_class_name)
        if not get_java_class(JOptionsFactory).isAssignableFrom(
                j_options_factory_clz):
            raise ValueError(
                "The input class does not implement RocksDBOptionsFactory.")
        self._j_rocks_db_state_backend.setRocksDBOptions(
            j_options_factory_clz.newInstance())
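
A hedged usage sketch; com.example.MyOptionsFactory is a hypothetical RocksDBOptionsFactory implementation with a default constructor, and env is an assumed StreamExecutionEnvironment:

state_backend = RocksDBStateBackend("file:///tmp/checkpoints")
state_backend.set_options("com.example.MyOptionsFactory")
env.set_state_backend(state_backend)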
Example #17
    def test_file_sink(self):
        base_path = "/tmp/1.txt"
        encoder = Encoder.simple_string_encoder()
        file_sink_builder = FileSink.for_row_format(base_path, encoder)
        file_sink = file_sink_builder\
            .with_bucket_check_interval(1000) \
            .with_bucket_assigner(BucketAssigner.base_path_bucket_assigner()) \
            .with_rolling_policy(RollingPolicy.on_checkpoint_rolling_policy()) \
            .with_output_file_config(
                OutputFileConfig.builder().with_part_prefix("pre").with_part_suffix("suf").build())\
            .enable_compact(FileCompactStrategy.builder()
                            .enable_compaction_on_checkpoint(3)
                            .set_size_threshold(1024)
                            .set_num_compact_threads(2)
                            .build(),
                            FileCompactor.concat_file_compactor(b'\n')) \
            .build()

        buckets_builder_field = \
            load_java_class("org.apache.flink.connector.file.sink.FileSink"). \
            getDeclaredField("bucketsBuilder")
        buckets_builder_field.setAccessible(True)
        buckets_builder = buckets_builder_field.get(
            file_sink.get_java_function())

        self.assertEqual("DefaultRowFormatBuilder",
                         buckets_builder.getClass().getSimpleName())

        row_format_builder_clz = load_java_class(
            "org.apache.flink.connector.file.sink.FileSink$RowFormatBuilder")
        encoder_field = row_format_builder_clz.getDeclaredField("encoder")
        encoder_field.setAccessible(True)
        self.assertEqual(
            "SimpleStringEncoder",
            encoder_field.get(buckets_builder).getClass().getSimpleName())

        interval_field = row_format_builder_clz.getDeclaredField(
            "bucketCheckInterval")
        interval_field.setAccessible(True)
        self.assertEqual(1000, interval_field.get(buckets_builder))

        bucket_assigner_field = row_format_builder_clz.getDeclaredField(
            "bucketAssigner")
        bucket_assigner_field.setAccessible(True)
        self.assertEqual(
            "BasePathBucketAssigner",
            bucket_assigner_field.get(
                buckets_builder).getClass().getSimpleName())

        rolling_policy_field = row_format_builder_clz.getDeclaredField(
            "rollingPolicy")
        rolling_policy_field.setAccessible(True)
        self.assertEqual(
            "OnCheckpointRollingPolicy",
            rolling_policy_field.get(
                buckets_builder).getClass().getSimpleName())

        output_file_config_field = row_format_builder_clz.getDeclaredField(
            "outputFileConfig")
        output_file_config_field.setAccessible(True)
        output_file_config = output_file_config_field.get(buckets_builder)
        self.assertEqual("pre", output_file_config.getPartPrefix())
        self.assertEqual("suf", output_file_config.getPartSuffix())

        compact_strategy_field = row_format_builder_clz.getDeclaredField(
            "compactStrategy")
        compact_strategy_field.setAccessible(True)
        compact_strategy = compact_strategy_field.get(buckets_builder)
        self.assertEqual(3,
                         compact_strategy.getNumCheckpointsBeforeCompaction())
        self.assertEqual(1024, compact_strategy.getSizeThreshold())
        self.assertEqual(2, compact_strategy.getNumCompactThreads())

        file_compactor_field = row_format_builder_clz.getDeclaredField(
            "fileCompactor")
        file_compactor_field.setAccessible(True)
        file_compactor = file_compactor_field.get(buckets_builder)
        self.assertEqual("ConcatFileCompactor",
                         file_compactor.getClass().getSimpleName())
        concat_file_compactor_clz = load_java_class(
            "org.apache.flink.connector.file.sink.compactor.ConcatFileCompactor"
        )
        file_delimiter_field = concat_file_compactor_clz.getDeclaredField(
            "fileDelimiter")
        file_delimiter_field.setAccessible(True)
        file_delimiter = file_delimiter_field.get(file_compactor)
        self.assertEqual(b'\n', file_delimiter)
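
One practical caveat: the FileSink commits files on checkpoints, so a streaming job that uses enable_compact needs checkpointing enabled, e.g. (the interval below is chosen arbitrarily):

env.enable_checkpointing(30000)  # `env` is an assumed StreamExecutionEnvironment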