Example #1
def _from_j_state_backend(j_state_backend):
    if j_state_backend is None:
        return None
    gateway = get_gateway()
    JStateBackend = gateway.jvm.org.apache.flink.runtime.state.StateBackend
    JMemoryStateBackend = gateway.jvm.org.apache.flink.runtime.state.memory.MemoryStateBackend
    JFsStateBackend = gateway.jvm.org.apache.flink.runtime.state.filesystem.FsStateBackend
    JRocksDBStateBackend = gateway.jvm.org.apache.flink.contrib.streaming.state.RocksDBStateBackend
    j_clz = j_state_backend.getClass()

    if not get_java_class(JStateBackend).isAssignableFrom(j_clz):
        raise TypeError("The input %s is not an instance of StateBackend." %
                        j_state_backend)

    if get_java_class(JMemoryStateBackend).isAssignableFrom(j_clz):
        return MemoryStateBackend(j_memory_state_backend=j_state_backend)
    elif get_java_class(JFsStateBackend).isAssignableFrom(j_clz):
        return FsStateBackend(j_fs_state_backend=j_state_backend)
    elif get_java_class(JRocksDBStateBackend).isAssignableFrom(j_clz):
        return RocksDBStateBackend(j_rocks_db_state_backend=j_state_backend)
    else:
        return CustomStateBackend(
            j_state_backend)  # users' customized state backend
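
A minimal usage sketch for context. This helper is internal to pyflink and is normally reached through env.get_state_backend(); the module paths below are assumed from recent pyflink releases:

# A minimal sketch, assuming the pyflink DataStream API; the helper itself is
# internal and invoked for you by env.get_state_backend().
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.state_backend import MemoryStateBackend

env = StreamExecutionEnvironment.get_execution_environment()
env.set_state_backend(MemoryStateBackend())
# get_state_backend() converts the underlying Java backend back into a
# Python wrapper using _from_j_state_backend.
backend = env.get_state_backend()
assert isinstance(backend, MemoryStateBackend)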
Example #2
def _from_j_restart_strategy(j_restart_strategy):
    if j_restart_strategy is None:
        return None
    gateway = get_gateway()
    NoRestartStrategyConfiguration = gateway.jvm.RestartStrategies\
        .NoRestartStrategyConfiguration
    FixedDelayRestartStrategyConfiguration = gateway.jvm.RestartStrategies\
        .FixedDelayRestartStrategyConfiguration
    FailureRateRestartStrategyConfiguration = gateway.jvm.RestartStrategies\
        .FailureRateRestartStrategyConfiguration
    FallbackRestartStrategyConfiguration = gateway.jvm.RestartStrategies\
        .FallbackRestartStrategyConfiguration
    clz = j_restart_strategy.getClass()
    if clz.getName() == get_java_class(NoRestartStrategyConfiguration).getName():
        return RestartStrategies.NoRestartStrategyConfiguration(
            j_restart_strategy=j_restart_strategy)
    elif clz.getName() == get_java_class(FixedDelayRestartStrategyConfiguration).getName():
        return RestartStrategies.FixedDelayRestartStrategyConfiguration(
            j_restart_strategy=j_restart_strategy)
    elif clz.getName() == get_java_class(FailureRateRestartStrategyConfiguration).getName():
        return RestartStrategies.FailureRateRestartStrategyConfiguration(
            j_restart_strategy=j_restart_strategy)
    elif clz.getName() == get_java_class(FallbackRestartStrategyConfiguration).getName():
        return RestartStrategies.FallbackRestartStrategyConfiguration(
            j_restart_strategy=j_restart_strategy)
    else:
        raise Exception("Unsupported java RestartStrategyConfiguration: %s" % clz.getName())
Example #3
def for_schema(schema: 'CsvSchema') -> 'CsvReaderFormat':
    """
    Builds a :class:`CsvReaderFormat` using `CsvSchema`.
    """
    jvm = get_gateway().jvm
    jackson = jvm.org.apache.flink.shaded.jackson2.com.fasterxml.jackson
    constructor = get_java_class(jvm.org.apache.flink.formats.csv.CsvReaderFormat) \
        .getDeclaredConstructor(
            to_jarray(jvm.Class, [
                get_java_class(jackson.dataformat.csv.CsvMapper),
                get_java_class(jackson.dataformat.csv.CsvSchema),
                get_java_class(jvm.Class),
                get_java_class(jvm.org.apache.flink.formats.common.Converter),
                get_java_class(jvm.org.apache.flink.api.common.typeinfo.TypeInformation),
                get_java_class(jvm.boolean)
            ])
        )
    constructor.setAccessible(True)
    j_csv_format = constructor.newInstance(
        to_jarray(jvm.Object, [
            jackson.dataformat.csv.CsvMapper(), schema._j_schema,
            get_java_class(jackson.databind.JsonNode),
            jvm.org.apache.flink.formats.csv.CsvToRowDataConverters(
                False).createRowConverter(
                    _to_java_data_type(schema._data_type).getLogicalType(),
                    True),
            jvm.org.apache.flink.table.runtime.typeutils.InternalTypeInfo.of(
                _to_java_data_type(schema._data_type).getLogicalType()),
            False
        ]))
    return CsvReaderFormat(j_csv_format)
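
A hedged usage sketch, assuming pyflink's file connector API as shipped in recent releases (module paths and builder methods may differ across versions; the path is hypothetical):

# A minimal sketch: build a CsvSchema, wrap it in a CsvReaderFormat, and
# feed it to a FileSource.
from pyflink.common import WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.file_system import FileSource
from pyflink.datastream.formats.csv import CsvReaderFormat, CsvSchema

schema = CsvSchema.builder() \
    .add_number_column('id') \
    .add_string_column('name') \
    .build()
source = FileSource.for_record_stream_format(
    CsvReaderFormat.for_schema(schema), '/path/to/input.csv').build()

env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_source(source, WatermarkStrategy.no_watermarks(), 'csv-source')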
Example #4
File: rdd.py Project: aim11/adam
    def pipe(self,
             cmd,
             tFormatter,
             xFormatter,
             convFn,
             files=None,
             environment=None,
             flankSize=0):
        """
        Pipes genomic data to a subprocess that runs in parallel using Spark.
        
        Files are substituted in to the command with a $x syntax. E.g., to invoke
        a command that uses the first file from the files Seq, use $0. To access
        the path to the directory where the files are copied, use $root.
        
        Pipes require the presence of an InFormatterCompanion and an OutFormatter
        as implicit values. The InFormatterCompanion should be a singleton whose
        apply method builds an InFormatter given a specific type of GenomicRDD.
        The implicit InFormatterCompanion yields an InFormatter which is used to
        format the input to the pipe, and the implicit OutFormatter is used to
        parse the output from the pipe.

        :param list cmd: The command to run.
        :param str tFormatter: The name of the ADAM in-formatter class to use.
        :param str xFormatter: The name of the ADAM out-formatter class to use.
        :param str convFn: The name of the ADAM GenomicRDD conversion class to
        use.
        :param list files: The files to copy locally onto all executors. Set to
        None (default) to omit.
        :param dict environment: The environment variables to set on the
        executor. Set to None (default) to omit.
        :param int flankSize: The number of bases of flanking sequence to have
        around each partition. Defaults to 0.
        :return: Returns a new RDD where the input from the original RDD has
        been piped through a command that runs locally on each executor.
        """

        jvm = self.sc._jvm

        tFormatterClass = get_java_class(getattr(jvm, tFormatter))
        
        xFormatterInst = getattr(jvm, xFormatter)()

        convFnInst = getattr(jvm, convFn)()
        
        if files is None:
            files = []

        if environment is None:
            environment = {}

        return self._replaceRdd(self._jvmRdd.pipe(cmd,
                                                  files,
                                                  environment,
                                                  flankSize,
                                                  tFormatterClass,
                                                  xFormatterInst,
                                                  convFnInst))
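
A hedged invocation sketch. The formatter and converter class names below are illustrative only and may differ across ADAM versions; check the ADAM documentation for the exact names in your release:

# A sketch assuming an existing SparkSession named `spark` and a
# hypothetical input file.
from bdgenomics.adam.adamContext import ADAMContext

ac = ADAMContext(spark)
reads = ac.loadAlignments('sample.bam')
piped = reads.pipe(['tee', '/dev/null'],
                   'org.bdgenomics.adam.rdd.read.SAMInFormatter',        # illustrative name
                   'org.bdgenomics.adam.rdd.read.AnySAMOutFormatter',    # illustrative name
                   'org.bdgenomics.adam.api.java.AlignmentRecordsToAlignmentRecordsConverter')  # illustrative name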
Example #5
    def testGetJavaClass(self):
        ArrayList = self.gateway.jvm.java.util.ArrayList
        clazz1 = ArrayList._java_lang_class
        clazz2 = get_java_class(ArrayList)

        self.assertEqual("java.util.ArrayList", clazz1.getName())
        self.assertEqual("java.util.ArrayList", clazz2.getName())
        self.assertEqual("java.lang.Class", clazz1.getClass().getName())
        self.assertEqual("java.lang.Class", clazz2.getClass().getName())
Example #6
    def __init__(self, path: str, schema: 'AvroSchema'):
        """
        :param path: The path to the Avro data file.
        :param schema: The :class:`AvroSchema` of the generic record.
        """
        jvm = get_gateway().jvm
        j_avro_input_format = jvm.org.apache.flink.formats.avro.AvroInputFormat(
            jvm.org.apache.flink.core.fs.Path(path),
            get_java_class(jvm.org.apache.flink.avro.shaded.org.apache.avro.
                           generic.GenericRecord))
        super().__init__(j_avro_input_format)
        self._type_info = GenericRecordAvroTypeInfo(schema)
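
A hedged usage sketch, assuming recent pyflink releases expose AvroInputFormat and AvroSchema under pyflink.datastream.formats.avro and support create_input on the execution environment; the schema string and path are hypothetical:

# A minimal sketch: read GenericRecords from an Avro file.
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.formats.avro import AvroInputFormat, AvroSchema

env = StreamExecutionEnvironment.get_execution_environment()
schema = AvroSchema.parse_string(
    '{"type": "record", "name": "User", '
    '"fields": [{"name": "name", "type": "string"}]}')
ds = env.create_input(AvroInputFormat('/path/to/data.avro', schema))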
Example #7
    def _get_execution_config(self, filename, schema):
        gateway = get_gateway()
        blink_t_env_class = get_java_class(gateway.jvm.org.apache.flink.table.
                                           api.internal.TableEnvironmentImpl)
        is_blink = (blink_t_env_class == self._j_tenv.getClass())
        if is_blink:
            # we cannot get the ExecutionConfig object from the TableEnvironmentImpl
            # for the moment, so just create a new ExecutionConfig.
            execution_config = gateway.jvm.org.apache.flink.api.common.ExecutionConfig()
        else:
            execution_config = self._j_tenv.execEnv().getConfig()

        return execution_config
Example #8
    def _from_file(self, filename, schema):
        gateway = get_gateway()
        blink_t_env_class = get_java_class(gateway.jvm.org.apache.flink.table.
                                           api.internal.TableEnvironmentImpl)
        if blink_t_env_class == self._j_tenv.getClass():
            raise NotImplementedError(
                "The operation 'from_elements' in batch mode is currently "
                "not supported when using blink planner.")
        else:
            jds = gateway.jvm.PythonBridgeUtils.createDataSetFromFile(
                self._j_tenv.execEnv(), filename, True)
            return Table(
                gateway.jvm.PythonTableUtils.fromDataSet(
                    self._j_tenv, jds, _to_java_type(schema)))
Example #9
def _from_j_checkpoint_storage(j_checkpoint_storage):
    if j_checkpoint_storage is None:
        return None
    gateway = get_gateway()
    JCheckpointStorage = gateway.jvm.org.apache.flink.runtime.state.CheckpointStorage
    JJobManagerCheckpointStorage = gateway.jvm.org.apache.flink.runtime.state.storage \
        .JobManagerCheckpointStorage
    JFileSystemCheckpointStorage = gateway.jvm.org.apache.flink.runtime.state.storage \
        .FileSystemCheckpointStorage

    j_clz = j_checkpoint_storage.getClass()

    if not get_java_class(JCheckpointStorage).isAssignableFrom(j_clz):
        raise TypeError("%s is not an instance of CheckpointStorage." %
                        j_checkpoint_storage)

    if get_java_class(JJobManagerCheckpointStorage).isAssignableFrom(j_clz):
        return JobManagerCheckpointStorage(
            j_jobmanager_checkpoint_storage=j_checkpoint_storage)
    elif get_java_class(JFileSystemCheckpointStorage).isAssignableFrom(j_clz):
        return FileSystemCheckpointStorage(
            j_filesystem_checkpoint_storage=j_checkpoint_storage)
    else:
        return CustomCheckpointStorage(j_checkpoint_storage)
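
A minimal round-trip sketch, assuming the pyflink checkpoint-storage API (available in recent releases); the helper itself is internal and normally reached via get_checkpoint_storage():

# A minimal sketch using the public pyflink API.
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.checkpoint_storage import FileSystemCheckpointStorage

env = StreamExecutionEnvironment.get_execution_environment()
env.get_checkpoint_config().set_checkpoint_storage(
    FileSystemCheckpointStorage('file:///tmp/checkpoints'))
# Reading it back converts the Java object via _from_j_checkpoint_storage.
storage = env.get_checkpoint_config().get_checkpoint_storage()
assert isinstance(storage, FileSystemCheckpointStorage)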
Example #10
File: java_utils.py Project: zhaoawd/flink
def is_instance_of(java_object, java_class):
    gateway = get_gateway()
    if isinstance(java_class, str):
        param = java_class
    elif isinstance(java_class, JavaClass):
        param = get_java_class(java_class)
    elif isinstance(java_class, JavaObject):
        if not is_instance_of(java_class, gateway.jvm.Class):
            param = java_class.getClass()
        else:
            param = java_class
    else:
        raise TypeError(
            "java_class must be a string, a JavaClass, or a JavaObject")

    return gateway.jvm.org.apache.flink.api.python.shaded.py4j.reflection.TypeUtil.isInstanceOf(
        param, java_object)
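
The helper accepts a class-name string, a JavaClass, or a JavaObject; a quick sketch of all three forms, grounded in the branches above:

# Usage sketch: the three accepted forms of java_class.
gateway = get_gateway()
j_list = gateway.jvm.java.util.ArrayList()
assert is_instance_of(j_list, "java.util.List")                   # class-name string
assert is_instance_of(j_list, gateway.jvm.java.util.List)         # JavaClass
assert is_instance_of(j_list, gateway.jvm.java.util.ArrayList())  # JavaObject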
Example #11
    def connect(self, connector_descriptor):
        """
        Creates a table source and/or table sink from a descriptor.

        Descriptors allow for declaring the communication to external systems in an
        implementation-agnostic way. The classpath is scanned for suitable table factories that
        match the desired configuration.

        The following example shows how to read from a connector using a JSON format and
        register a table source as "MyTable":
        ::

            >>> table_env \\
            ...     .connect(ExternalSystemXYZ()
            ...              .version("0.11")) \\
            ...     .with_format(Json()
            ...                  .json_schema("{...}")
            ...                  .fail_on_missing_field(False)) \\
            ...     .with_schema(Schema()
            ...                  .field("user-name", "VARCHAR")
            ...                  .from_origin_field("u_name")
            ...                  .field("count", "DECIMAL")) \\
            ...     .register_table_source("MyTable")

        :param connector_descriptor: Connector descriptor describing the external system.
        :type connector_descriptor: ConnectorDescriptor
        :return: A :class:`BatchTableDescriptor` or a :class:`StreamTableDescriptor`
                 (for blink planner) used to build the table source/sink.
        :rtype: BatchTableDescriptor or StreamTableDescriptor
        """
        gateway = get_gateway()
        blink_t_env_class = get_java_class(gateway.jvm.org.apache.flink.table.
                                           api.internal.TableEnvironmentImpl)
        if blink_t_env_class == self._j_tenv.getClass():
            return StreamTableDescriptor(
                self._j_tenv.connect(
                    connector_descriptor._j_connector_descriptor))
        else:
            return BatchTableDescriptor(
                self._j_tenv.connect(
                    connector_descriptor._j_connector_descriptor))
Example #12
    def set_options(self, options_factory_class_name: str):
        """
        Sets ``org.rocksdb.Options`` for the RocksDB instances.
        Because the options are not serializable and hold native code references,
        they must be specified through a factory.

        The options created by the factory here are applied on top of the pre-defined
        options profile selected via :func:`set_predefined_options`.
        If the pre-defined options profile is the default (:data:`PredefinedOptions.DEFAULT`),
        then the factory fully controls the RocksDB options.

        :param options_factory_class_name: The fully-qualified class name of the options
                                           factory in Java that lazily creates the RocksDB options.
                                           The options factory must have a default constructor.
        """
        gateway = get_gateway()
        JOptionsFactory = gateway.jvm.org.apache.flink.contrib.streaming.state.RocksDBOptionsFactory
        j_options_factory_clz = load_java_class(options_factory_class_name)
        if not get_java_class(JOptionsFactory).isAssignableFrom(j_options_factory_clz):
            raise ValueError("The input class does not implement RocksDBOptionsFactory.")
        self._j_rocks_db_state_backend.setRocksDBOptions(j_options_factory_clz.newInstance())
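
A hedged usage sketch; the factory class name below is hypothetical and must name a Java class on the classpath that implements RocksDBOptionsFactory:

# A minimal sketch using the pyflink RocksDB state backend.
from pyflink.datastream.state_backend import RocksDBStateBackend

backend = RocksDBStateBackend('file:///tmp/checkpoints')
backend.set_options('com.example.MyRocksDBOptionsFactory')  # hypothetical class name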
Example #13
    def offsets(
        offsets: Dict['KafkaTopicPartition', int],
        offset_reset_strategy:
        'KafkaOffsetResetStrategy' = KafkaOffsetResetStrategy.EARLIEST
    ) -> 'KafkaOffsetsInitializer':
        """
        Get a :class:`KafkaOffsetsInitializer` which initializes the offsets to the specified
        offsets.

        An optional :class:`KafkaOffsetResetStrategy` can be specified to initialize the offsets in
        case the specified offset is out of range.

        Example:
        ::

            >>> KafkaOffsetsInitializer.offsets({
            ...     KafkaTopicPartition('TOPIC1', 0): 0,
            ...     KafkaTopicPartition('TOPIC1', 1): 10000
            ... }, KafkaOffsetResetStrategy.EARLIEST)

        :param offsets: the specified offsets for each partition.
        :param offset_reset_strategy: the :class:`KafkaOffsetResetStrategy` to use when the
            specified offset is out of range.
        :return: a :class:`KafkaOffsetsInitializer` which initializes the offsets to the specified
            offsets.
        """
        jvm = get_gateway().jvm
        j_map_wrapper = jvm.org.apache.flink.python.util.HashMapWrapper(
            None, get_java_class(jvm.Long))
        for tp, offset in offsets.items():
            j_map_wrapper.put(tp._to_j_topic_partition(), offset)

        JOffsetsInitializer = get_gateway().jvm.org.apache.flink.connector.kafka.source. \
            enumerator.initializer.OffsetsInitializer
        return KafkaOffsetsInitializer(
            JOffsetsInitializer.offsets(
                j_map_wrapper.asMap(),
                offset_reset_strategy._to_j_offset_reset_strategy()))
Example #14
    def create(execution_environment=None,
               table_config=None,
               environment_settings=None):
        """
        Creates a :class:`BatchTableEnvironment`.

        Example:
        ::

            # create with ExecutionEnvironment.
            >>> env = ExecutionEnvironment.get_execution_environment()
            >>> table_env = BatchTableEnvironment.create(env)
            # create with ExecutionEnvironment and TableConfig.
            >>> table_config = TableConfig()
            >>> table_config.set_null_check(False)
            >>> table_env = BatchTableEnvironment.create(env, table_config)
            # create with EnvironmentSettings.
            >>> environment_settings = EnvironmentSettings.new_instance().in_batch_mode() \\
            ...     .use_blink_planner().build()
            >>> table_env = BatchTableEnvironment.create(environment_settings=environment_settings)

        :param execution_environment: The batch :class:`pyflink.dataset.ExecutionEnvironment` of
                                      the TableEnvironment.
        :type execution_environment: pyflink.dataset.ExecutionEnvironment
        :param table_config: The configuration of the TableEnvironment, optional.
        :type table_config: TableConfig
        :param environment_settings: The environment settings used to instantiate the
                                     TableEnvironment. It provides the interfaces about planner
                                     selection (flink or blink), optional.
        :type environment_settings: pyflink.table.EnvironmentSettings
        :return: The BatchTableEnvironment created from given ExecutionEnvironment and
                 configuration.
        :rtype: BatchTableEnvironment
        """
        if execution_environment is None and \
                table_config is None and \
                environment_settings is None:
            raise ValueError(
                "No argument found, the param 'execution_environment' "
                "or 'environment_settings' is required.")
        elif execution_environment is None and \
                table_config is not None and \
                environment_settings is None:
            raise ValueError(
                "Only the param 'table_config' is found, "
                "the param 'execution_environment' is also required.")
        elif execution_environment is not None and \
                environment_settings is not None:
            raise ValueError(
                "The param 'execution_environment' and "
                "'environment_settings' cannot be used at the same time")
        elif table_config is not None and \
                environment_settings is not None:
            raise ValueError(
                "The param 'table_config' and "
                "'environment_settings' cannot be used at the same time")

        gateway = get_gateway()
        if execution_environment is not None and environment_settings is None:
            if table_config is not None:
                j_tenv = gateway.jvm.BatchTableEnvironment.create(
                    execution_environment._j_execution_environment,
                    table_config._j_table_config)
            else:
                j_tenv = gateway.jvm.BatchTableEnvironment.create(
                    execution_environment._j_execution_environment)
            return BatchTableEnvironment(j_tenv, False)
        elif environment_settings is not None and \
                execution_environment is None and \
                table_config is None:
            if environment_settings.is_streaming_mode():
                raise ValueError(
                    "The environment settings for BatchTableEnvironment must be "
                    "set to batch mode.")
            j_tenv = gateway.jvm.TableEnvironment.create(
                environment_settings._j_environment_settings)
            j_planner_class = j_tenv.getPlanner().getClass()
            j_blink_planner_class = get_java_class(
                gateway.jvm.org.apache.flink.table.planner.delegation.PlannerBase)
            is_blink_planner = j_blink_planner_class.isAssignableFrom(j_planner_class)
            return BatchTableEnvironment(j_tenv, is_blink_planner)
Example #15
    def create(stream_execution_environment,
               table_config=None,
               environment_settings=None):
        """
        Creates a :class:`TableEnvironment` for a
        :class:`~pyflink.datastream.StreamExecutionEnvironment`.

        Example:
        ::

            >>> env = StreamExecutionEnvironment.get_execution_environment()
            # create without optional parameters.
            >>> table_env = StreamTableEnvironment.create(env)
            # create with TableConfig
            >>> table_config = TableConfig()
            >>> table_config.set_null_check(False)
            >>> table_env = StreamTableEnvironment.create(env, table_config)
            # create with EnvironmentSettings
            >>> environment_settings = EnvironmentSettings.new_instance().use_blink_planner() \\
            ...     .build()
            >>> table_env = StreamTableEnvironment.create(
            ...     env, environment_settings=environment_settings)

        :param stream_execution_environment: The
                                             :class:`~pyflink.datastream.StreamExecutionEnvironment`
                                             of the TableEnvironment.
        :type stream_execution_environment: pyflink.datastream.StreamExecutionEnvironment
        :param table_config: The configuration of the TableEnvironment, optional.
        :type table_config: TableConfig
        :param environment_settings: The environment settings used to instantiate the
                                     TableEnvironment. It provides the interfaces about planner
                                     selection (flink or blink), optional.
        :type environment_settings: pyflink.table.EnvironmentSettings
        :return: The :class:`StreamTableEnvironment` created from given StreamExecutionEnvironment
                 and configuration.
        :rtype: StreamTableEnvironment
        """
        if table_config is not None and environment_settings is not None:
            raise ValueError(
                "The param 'table_config' and "
                "'environment_settings' cannot be used at the same time")

        gateway = get_gateway()
        if table_config is not None:
            j_tenv = gateway.jvm.StreamTableEnvironment.create(
                stream_execution_environment._j_stream_execution_environment,
                table_config._j_table_config)
        elif environment_settings is not None:
            if not environment_settings.is_streaming_mode():
                raise ValueError(
                    "The environment settings for StreamTableEnvironment must be "
                    "set to streaming mode.")
            j_tenv = gateway.jvm.StreamTableEnvironment.create(
                stream_execution_environment._j_stream_execution_environment,
                environment_settings._j_environment_settings)
        else:
            j_tenv = gateway.jvm.StreamTableEnvironment.create(
                stream_execution_environment._j_stream_execution_environment)
        j_planner_class = j_tenv.getPlanner().getClass()
        j_blink_planner_class = get_java_class(
            gateway.jvm.org.apache.flink.table.planner.delegation.PlannerBase)
        is_blink_planner = j_blink_planner_class.isAssignableFrom(j_planner_class)
        return StreamTableEnvironment(j_tenv, is_blink_planner)