def _from_j_state_backend(j_state_backend): if j_state_backend is None: return None gateway = get_gateway() JStateBackend = gateway.jvm.org.apache.flink.runtime.state.StateBackend JMemoryStateBackend = gateway.jvm.org.apache.flink.runtime.state.memory.MemoryStateBackend JFsStateBackend = gateway.jvm.org.apache.flink.runtime.state.filesystem.FsStateBackend JRocksDBStateBackend = gateway.jvm.org.apache.flink.contrib.streaming.state.RocksDBStateBackend j_clz = j_state_backend.getClass() if not get_java_class(JStateBackend).isAssignableFrom(j_clz): raise TypeError("The input %s is not an instance of StateBackend." % j_state_backend) if get_java_class(JMemoryStateBackend).isAssignableFrom( j_state_backend.getClass()): return MemoryStateBackend(j_memory_state_backend=j_state_backend) elif get_java_class(JFsStateBackend).isAssignableFrom( j_state_backend.getClass()): return FsStateBackend(j_fs_state_backend=j_state_backend) elif get_java_class(JRocksDBStateBackend).isAssignableFrom( j_state_backend.getClass()): return RocksDBStateBackend(j_rocks_db_state_backend=j_state_backend) else: return CustomStateBackend( j_state_backend) # users' customized state backend
def _from_j_restart_strategy(j_restart_strategy): if j_restart_strategy is None: return None gateway = get_gateway() NoRestartStrategyConfiguration = gateway.jvm.RestartStrategies\ .NoRestartStrategyConfiguration FixedDelayRestartStrategyConfiguration = gateway.jvm.RestartStrategies\ .FixedDelayRestartStrategyConfiguration FailureRateRestartStrategyConfiguration = gateway.jvm.RestartStrategies\ .FailureRateRestartStrategyConfiguration FallbackRestartStrategyConfiguration = gateway.jvm.RestartStrategies\ .FallbackRestartStrategyConfiguration clz = j_restart_strategy.getClass() if clz.getName() == get_java_class(NoRestartStrategyConfiguration).getName(): return RestartStrategies.NoRestartStrategyConfiguration( j_restart_strategy=j_restart_strategy) elif clz.getName() == get_java_class(FixedDelayRestartStrategyConfiguration).getName(): return RestartStrategies.FixedDelayRestartStrategyConfiguration( j_restart_strategy=j_restart_strategy) elif clz.getName() == get_java_class(FailureRateRestartStrategyConfiguration).getName(): return RestartStrategies.FailureRateRestartStrategyConfiguration( j_restart_strategy=j_restart_strategy) elif clz.getName() == get_java_class(FallbackRestartStrategyConfiguration).getName(): return RestartStrategies.FallbackRestartStrategyConfiguration( j_restart_strategy=j_restart_strategy) else: raise Exception("Unsupported java RestartStrategyConfiguration: %s" % clz.getName())
def for_schema(schema: 'CsvSchema') -> 'CsvReaderFormat':
    """
    Builds a :class:`CsvReaderFormat` using `CsvSchema`.

    The Java ``CsvReaderFormat`` is created reflectively: its six-argument declared
    constructor is looked up, made accessible and then invoked.

    :param schema: The :class:`CsvSchema`; its ``_j_schema`` and ``_data_type`` are used.
    :return: A :class:`CsvReaderFormat` wrapping the created Java object.
    """
    jvm = get_gateway().jvm
    jackson = jvm.org.apache.flink.shaded.jackson2.com.fasterxml.jackson

    # Parameter types of the declared constructor that is looked up below.
    j_parameter_types = to_jarray(jvm.Class, [
        get_java_class(jackson.dataformat.csv.CsvMapper),
        get_java_class(jackson.dataformat.csv.CsvSchema),
        get_java_class(jvm.Class),
        get_java_class(jvm.org.apache.flink.formats.common.Converter),
        get_java_class(jvm.org.apache.flink.api.common.typeinfo.TypeInformation),
        get_java_class(jvm.boolean),
    ])
    j_constructor = get_java_class(jvm.org.apache.flink.formats.csv.CsvReaderFormat) \
        .getDeclaredConstructor(j_parameter_types)
    # NOTE(review): presumably the constructor is not public - confirm against the Java class.
    j_constructor.setAccessible(True)

    j_logical_type = _to_java_data_type(schema._data_type).getLogicalType()
    j_row_converter = jvm.org.apache.flink.formats.csv.CsvToRowDataConverters(False) \
        .createRowConverter(j_logical_type, True)
    j_type_info = jvm.org.apache.flink.table.runtime.typeutils.InternalTypeInfo.of(
        j_logical_type)

    # Arguments in the same order as the parameter types above.
    j_arguments = to_jarray(jvm.Object, [
        jackson.dataformat.csv.CsvMapper(),
        schema._j_schema,
        get_java_class(jackson.databind.JsonNode),
        j_row_converter,
        j_type_info,
        False,
    ])
    return CsvReaderFormat(j_constructor.newInstance(j_arguments))
def pipe(self, cmd, tFormatter, xFormatter, convFn, files=None, environment=None, flankSize=0):
    """
    Pipes genomic data to a subprocess that runs in parallel using Spark.

    Files are substituted in to the command with a $x syntax. E.g., to invoke
    a command that uses the first file from the files Seq, use $0. To access
    the path to the directory where the files are copied, use $root.

    Pipes require the presence of an InFormatterCompanion and an OutFormatter
    as implicit values. The InFormatterCompanion should be a singleton whose
    apply method builds an InFormatter given a specific type of GenomicRDD.
    The implicit InFormatterCompanion yields an InFormatter which is used to
    format the input to the pipe, and the implicit OutFormatter is used to
    parse the output from the pipe.

    In this wrapper the in-formatter is passed to the JVM as a Java class,
    while the out-formatter and the conversion function are instantiated
    with their no-argument constructors.

    :param list cmd: The command to run.
    :param str tFormatter: The name of the ADAM in-formatter class to use.
    :param str xFormatter: The name of the ADAM out-formatter class to use.
    :param str convFn: The name of the ADAM GenomicRDD conversion class to
      use.
    :param list files: The files to copy locally onto all executors. Set to
      None (default) to omit.
    :param dict environment: The environment variables to set on the
      executor. Set to None (default) to omit.
    :param int flankSize: The number of bases of flanking sequence to have
      around each partition. Defaults to 0.
    :return: Returns a new RDD where the input from the original RDD has
      been piped through a command that runs locally on each executor.
    """
    # None stands for "nothing to copy" / "nothing to set".
    local_files = [] if files is None else files
    env_vars = {} if environment is None else environment

    jvm = self.sc._jvm
    in_formatter_class = get_java_class(getattr(jvm, tFormatter))
    out_formatter = getattr(jvm, xFormatter)()
    conversion_fn = getattr(jvm, convFn)()

    piped_rdd = self._jvmRdd.pipe(cmd,
                                  local_files,
                                  env_vars,
                                  flankSize,
                                  in_formatter_class,
                                  out_formatter,
                                  conversion_fn)
    return self._replaceRdd(piped_rdd)
def testGetJavaClass(self):
    """
    Checks that ``get_java_class`` and the ``_java_lang_class`` attribute both yield a
    ``java.lang.Class`` object describing ``java.util.ArrayList``.
    """
    array_list = self.gateway.jvm.java.util.ArrayList
    class_handles = (array_list._java_lang_class, get_java_class(array_list))

    # Both handles must name the ArrayList class ...
    for handle in class_handles:
        self.assertEqual("java.util.ArrayList", handle.getName())
    # ... and both must themselves be java.lang.Class instances.
    for handle in class_handles:
        self.assertEqual("java.lang.Class", handle.getClass().getName())
def testGetJavaClass(self):
    """
    Checks that ``get_java_class`` and the ``_java_lang_class`` attribute both yield a
    ``java.lang.Class`` object describing ``java.util.ArrayList``.
    """
    array_list = self.gateway.jvm.java.util.ArrayList
    class_handles = (array_list._java_lang_class, get_java_class(array_list))

    # Both handles must name the ArrayList class ...
    for handle in class_handles:
        self.assertEqual("java.util.ArrayList", handle.getName())
    # ... and both must themselves be java.lang.Class instances.
    for handle in class_handles:
        self.assertEqual("java.lang.Class", handle.getClass().getName())
def __init__(self, path: str, schema: 'AvroSchema'):
    """
    Creates the Java ``AvroInputFormat`` for generic records and wraps it.

    :param path: The path to Avro data file.
    :param schema: The :class:`AvroSchema` of generic record.
    """
    jvm = get_gateway().jvm
    j_path = jvm.org.apache.flink.core.fs.Path(path)
    # The record class is the GenericRecord from the shaded Avro package.
    j_record_class = get_java_class(
        jvm.org.apache.flink.avro.shaded.org.apache.avro.generic.GenericRecord)
    j_input_format = jvm.org.apache.flink.formats.avro.AvroInputFormat(j_path, j_record_class)

    super().__init__(j_input_format)
    self._type_info = GenericRecordAvroTypeInfo(schema)
def _get_execution_config(self, filename, schema):
    """
    Returns the Java ``ExecutionConfig`` to use for this table environment.

    :param filename: Not used by this implementation.
    :param schema: Not used by this implementation.
    :return: A newly created ``ExecutionConfig`` when the wrapped Java table environment's
             class equals ``TableEnvironmentImpl``; otherwise the config of its execution
             environment.
    """
    jvm = get_gateway().jvm
    j_table_env_impl_class = get_java_class(
        jvm.org.apache.flink.table.api.internal.TableEnvironmentImpl)

    if j_table_env_impl_class == self._j_tenv.getClass():
        # we can not get ExecutionConfig object from the TableEnvironmentImpl
        # for the moment, just create a new ExecutionConfig.
        return jvm.org.apache.flink.api.common.ExecutionConfig()

    return self._j_tenv.execEnv().getConfig()
def _from_file(self, filename, schema):
    """
    Creates a :class:`Table` from a file via a Java DataSet.

    :param filename: The file handed to ``PythonBridgeUtils.createDataSetFromFile``.
    :param schema: The schema, converted with ``_to_java_type`` for the Java side.
    :return: The :class:`Table` wrapping the Java table built from the DataSet.
    :raises NotImplementedError: If the wrapped Java table environment's class equals
                                 ``TableEnvironmentImpl``.
    """
    jvm = get_gateway().jvm
    j_table_env_impl_class = get_java_class(
        jvm.org.apache.flink.table.api.internal.TableEnvironmentImpl)

    if j_table_env_impl_class == self._j_tenv.getClass():
        raise NotImplementedError(
            "The operation 'from_elements' in batch mode is currently "
            "not supported when using blink planner.")

    j_data_set = jvm.PythonBridgeUtils.createDataSetFromFile(
        self._j_tenv.execEnv(), filename, True)
    j_table = jvm.PythonTableUtils.fromDataSet(
        self._j_tenv, j_data_set, _to_java_type(schema))
    return Table(j_table)
def _from_j_checkpoint_storage(j_checkpoint_storage): if j_checkpoint_storage is None: return None gateway = get_gateway() JCheckpointStorage = gateway.jvm.org.apache.flink.runtime.state.CheckpointStorage JJobManagerCheckpointStorage = gateway.jvm.org.apache.flink.runtime.state.storage \ .JobManagerCheckpointStorage JFileSystemCheckpointStorage = gateway.jvm.org.apache.flink.runtime.state.storage \ .FileSystemCheckpointStorage j_clz = j_checkpoint_storage.getClass() if not get_java_class(JCheckpointStorage).isAssignableFrom(j_clz): raise TypeError("%s is not an instance of CheckpointStorage." % j_checkpoint_storage) if get_java_class(JJobManagerCheckpointStorage).isAssignableFrom(j_clz): return JobManagerCheckpointStorage( j_jobmanager_checkpoint_storage=j_checkpoint_storage) elif get_java_class(JFileSystemCheckpointStorage).isAssignableFrom(j_clz): return FileSystemCheckpointStorage( j_filesystem_checkpoint_storage=j_checkpoint_storage) else: return CustomCheckpointStorage(j_checkpoint_storage)
def is_instance_of(java_object, java_class):
    """
    Checks on the JVM side whether ``java_object`` is an instance of ``java_class``.

    :param java_object: The Java object to test.
    :param java_class: The class to test against. A string is passed through unchanged, a
                       ``JavaClass`` is converted with ``get_java_class``, and a
                       ``JavaObject`` is used as is when it is itself an instance of
                       ``Class``; otherwise its own class is used.
    :return: The result of the shaded py4j ``TypeUtil.isInstanceOf`` call.
    :raises TypeError: If ``java_class`` is none of the supported kinds.
    """
    gateway = get_gateway()

    if isinstance(java_class, str):
        target = java_class
    elif isinstance(java_class, JavaClass):
        target = get_java_class(java_class)
    elif isinstance(java_class, JavaObject):
        # A JavaObject may already be a Class handle; if not, fall back to its class.
        target = (java_class
                  if is_instance_of(java_class, gateway.jvm.Class)
                  else java_class.getClass())
    else:
        raise TypeError(
            "java_class must be a string, a JavaClass, or a JavaObject")

    type_util = gateway.jvm.org.apache.flink.api.python.shaded.py4j.reflection.TypeUtil
    return type_util.isInstanceOf(target, java_object)
def connect(self, connector_descriptor):
    """
    Creates a table source and/or table sink from a descriptor.

    Descriptors allow for declaring the communication to external systems in an
    implementation-agnostic way. The classpath is scanned for suitable table factories that
    match the desired configuration.

    The following example shows how to read from a connector using a JSON format and
    registering a table source as "MyTable":
    ::

        >>> table_env \\
        ...     .connect(ExternalSystemXYZ()
        ...              .version("0.11")) \\
        ...     .with_format(Json()
        ...                  .json_schema("{...}")
        ...                  .fail_on_missing_field(False)) \\
        ...     .with_schema(Schema()
        ...                  .field("user-name", "VARCHAR")
        ...                  .from_origin_field("u_name")
        ...                  .field("count", "DECIMAL")) \\
        ...     .register_table_source("MyTable")

    :param connector_descriptor: Connector descriptor describing the external system.
    :type connector_descriptor: ConnectorDescriptor
    :return: A :class:`BatchTableDescriptor` or a :class:`StreamTableDescriptor`
             (for blink planner) used to build the table source/sink.
    :rtype: BatchTableDescriptor or StreamTableDescriptor
    """
    j_table_env_impl_class = get_java_class(
        get_gateway().jvm.org.apache.flink.table.api.internal.TableEnvironmentImpl)

    # A wrapped TableEnvironmentImpl yields a stream descriptor, anything else a batch one.
    if j_table_env_impl_class == self._j_tenv.getClass():
        descriptor_type = StreamTableDescriptor
    else:
        descriptor_type = BatchTableDescriptor

    j_descriptor = self._j_tenv.connect(connector_descriptor._j_connector_descriptor)
    return descriptor_type(j_descriptor)
def set_options(self, options_factory_class_name: str):
    """
    Sets ``org.rocksdb.Options`` for the RocksDB instances.
    Because the options are not serializable and hold native code references,
    they must be specified through a factory.

    The options created by the factory here are applied on top of the pre-defined
    options profile selected via :func:`set_predefined_options`.
    If the pre-defined options profile is the default (:data:`PredefinedOptions.DEFAULT`),
    then the factory fully controls the RocksDB options.

    :param options_factory_class_name: The fully-qualified class name of the options
                                       factory in Java that lazily creates the RocksDB
                                       options. The options factory must have a default
                                       constructor.
    :raises ValueError: If the loaded class is not assignable to ``RocksDBOptionsFactory``.
    """
    j_factory_interface = \
        get_gateway().jvm.org.apache.flink.contrib.streaming.state.RocksDBOptionsFactory
    j_factory_class = load_java_class(options_factory_class_name)

    implements_factory = get_java_class(j_factory_interface).isAssignableFrom(j_factory_class)
    if not implements_factory:
        raise ValueError("The input class does not implement RocksDBOptionsFactory.")

    # The factory is instantiated through its no-argument constructor.
    j_factory = j_factory_class.newInstance()
    self._j_rocks_db_state_backend.setRocksDBOptions(j_factory)
def offsets(
    offsets: Dict['KafkaTopicPartition', int],
    offset_reset_strategy: 'KafkaOffsetResetStrategy' = KafkaOffsetResetStrategy.EARLIEST
) -> 'KafkaOffsetsInitializer':
    """
    Get an :class:`KafkaOffsetsInitializer` which initializes the offsets to the specified
    offsets.

    An optional :class:`KafkaOffsetResetStrategy` can be specified to initialize the offsets
    in case the specified offset is out of range.

    Example:
    ::

        >>> KafkaOffsetsInitializer.offsets({
        ...     KafkaTopicPartition('TOPIC1', 0): 0,
        ...     KafkaTopicPartition('TOPIC1', 1): 10000
        ... }, KafkaOffsetResetStrategy.EARLIEST)

    :param offsets: the specified offsets for each partition.
    :param offset_reset_strategy: the :class:`KafkaOffsetResetStrategy` to use when the
        specified offset is out of range.
    :return: an :class:`KafkaOffsetsInitializer` which initializes the offsets to the
        specified offsets.
    """
    jvm = get_gateway().jvm

    # The Java map is built through HashMapWrapper, which is given java.lang.Long as the
    # value class and None as the key class.
    j_offsets = jvm.org.apache.flink.python.util.HashMapWrapper(
        None, get_java_class(jvm.Long))
    for topic_partition, start_offset in offsets.items():
        j_offsets.put(topic_partition._to_j_topic_partition(), start_offset)

    j_initializer_class = \
        jvm.org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
    j_initializer = j_initializer_class.offsets(
        j_offsets.asMap(), offset_reset_strategy._to_j_offset_reset_strategy())
    return KafkaOffsetsInitializer(j_initializer)
def create(execution_environment=None, table_config=None, environment_settings=None):
    """
    Creates a :class:`BatchTableEnvironment`.

    Either ``execution_environment`` (optionally together with ``table_config``) or
    ``environment_settings`` alone must be given; every other combination is rejected.

    Example:
    ::

        # create with ExecutionEnvironment.
        >>> env = ExecutionEnvironment.get_execution_environment()
        >>> table_env = BatchTableEnvironment.create(env)
        # create with ExecutionEnvironment and TableConfig.
        >>> table_config = TableConfig()
        >>> table_config.set_null_check(False)
        >>> table_env = BatchTableEnvironment.create(env, table_config)
        # create with EnvironmentSettings.
        >>> environment_settings = EnvironmentSettings.new_instance().in_batch_mode() \\
        ...     .use_blink_planner().build()
        >>> table_env = BatchTableEnvironment.create(environment_settings=environment_settings)

    :param execution_environment: The batch :class:`pyflink.dataset.ExecutionEnvironment` of
                                  the TableEnvironment.
    :type execution_environment: pyflink.dataset.ExecutionEnvironment
    :param table_config: The configuration of the TableEnvironment, optional.
    :type table_config: TableConfig
    :param environment_settings: The environment settings used to instantiate the
                                 TableEnvironment. It provides the interfaces about planner
                                 selection(flink or blink), optional.
    :type environment_settings: pyflink.table.EnvironmentSettings
    :return: The BatchTableEnvironment created from given ExecutionEnvironment and
             configuration.
    :rtype: BatchTableEnvironment
    :raises ValueError: If the argument combination is not supported, or if the given
                        environment settings are in streaming mode.
    """
    has_env = execution_environment is not None
    has_config = table_config is not None
    has_settings = environment_settings is not None

    # Reject unsupported argument combinations before touching the gateway.
    if not has_env and not has_settings:
        if has_config:
            raise ValueError(
                "Only the param 'table_config' is found, "
                "the param 'execution_environment' is also required.")
        raise ValueError(
            "No argument found, the param 'execution_environment' "
            "or 'environment_settings' is required.")
    if has_env and has_settings:
        raise ValueError(
            "The param 'execution_environment' and "
            "'environment_settings' cannot be used at the same time")
    if has_config and has_settings:
        raise ValueError(
            "The param 'table_config' and "
            "'environment_settings' cannot be used at the same time")

    gateway = get_gateway()

    if has_settings:
        # Only the environment settings were given at this point.
        if environment_settings.is_streaming_mode():
            raise ValueError(
                "The environment settings for BatchTableEnvironment must be "
                "set to batch mode.")
        j_tenv = gateway.jvm.TableEnvironment.create(
            environment_settings._j_environment_settings)
        # The planner counts as blink when its class is assignable to PlannerBase.
        j_planner_base_class = get_java_class(
            gateway.jvm.org.apache.flink.table.planner.delegation.PlannerBase)
        uses_blink_planner = j_planner_base_class.isAssignableFrom(
            j_tenv.getPlanner().getClass())
        return BatchTableEnvironment(j_tenv, uses_blink_planner)

    # An execution environment was given, optionally with a table config.
    j_batch_tenv_class = gateway.jvm.BatchTableEnvironment
    j_env = execution_environment._j_execution_environment
    if has_config:
        j_tenv = j_batch_tenv_class.create(j_env, table_config._j_table_config)
    else:
        j_tenv = j_batch_tenv_class.create(j_env)
    return BatchTableEnvironment(j_tenv, False)
def create(stream_execution_environment, table_config=None, environment_settings=None):
    """
    Creates a :class:`TableEnvironment` for a
    :class:`~pyflink.datastream.StreamExecutionEnvironment`.

    Example:
    ::

        >>> env = StreamExecutionEnvironment.get_execution_environment()
        # create without optional parameters.
        >>> table_env = StreamTableEnvironment.create(env)
        # create with TableConfig
        >>> table_config = TableConfig()
        >>> table_config.set_null_check(False)
        >>> table_env = StreamTableEnvironment.create(env, table_config)
        # create with EnvironmentSettings
        >>> environment_settings = EnvironmentSettings.new_instance().use_blink_planner() \\
        ...     .build()
        >>> table_env = StreamTableEnvironment.create(
        ...     env, environment_settings=environment_settings)

    :param stream_execution_environment: The
                                         :class:`~pyflink.datastream.StreamExecutionEnvironment`
                                         of the TableEnvironment.
    :type stream_execution_environment: pyflink.datastream.StreamExecutionEnvironment
    :param table_config: The configuration of the TableEnvironment, optional.
    :type table_config: TableConfig
    :param environment_settings: The environment settings used to instantiate the
                                 TableEnvironment. It provides the interfaces about planner
                                 selection(flink or blink), optional.
    :type environment_settings: pyflink.table.EnvironmentSettings
    :return: The :class:`StreamTableEnvironment` created from given
             StreamExecutionEnvironment and configuration.
    :rtype: StreamTableEnvironment
    :raises ValueError: If both ``table_config`` and ``environment_settings`` are given, or
                        if the given environment settings are not in streaming mode.
    """
    if table_config is not None and environment_settings is not None:
        raise ValueError(
            "The param 'table_config' and "
            "'environment_settings' cannot be used at the same time")

    gateway = get_gateway()

    # At most one of table_config / environment_settings is set from here on.
    if environment_settings is not None and not environment_settings.is_streaming_mode():
        raise ValueError(
            "The environment settings for StreamTableEnvironment must be "
            "set to streaming mode.")

    j_stream_tenv_class = gateway.jvm.StreamTableEnvironment
    j_env = stream_execution_environment._j_stream_execution_environment
    if table_config is not None:
        j_tenv = j_stream_tenv_class.create(j_env, table_config._j_table_config)
    elif environment_settings is not None:
        j_tenv = j_stream_tenv_class.create(
            j_env, environment_settings._j_environment_settings)
    else:
        j_tenv = j_stream_tenv_class.create(j_env)

    # The planner counts as blink when its class is assignable to PlannerBase.
    j_planner_base_class = get_java_class(
        gateway.jvm.org.apache.flink.table.planner.delegation.PlannerBase)
    uses_blink_planner = j_planner_base_class.isAssignableFrom(
        j_tenv.getPlanner().getClass())
    return StreamTableEnvironment(j_tenv, uses_blink_planner)