def startup_loopback_server():
    from pyflink.common import Configuration
    from pyflink.fn_execution.beam.beam_worker_pool_service import \
        BeamFnLoopbackWorkerPoolServicer

    # j_configuration is a free variable from the enclosing scope; this helper is
    # defined inside the method that resolved the Java environment configuration.
    config = Configuration(j_configuration=j_configuration)
    config.set_string("python.loopback-server.address",
                      BeamFnLoopbackWorkerPoolServicer().start())

def test_side_output_late_data(self):
    self.env.set_parallelism(1)
    config = Configuration(j_configuration=get_j_env_configuration(
        self.env._j_stream_execution_environment))
    config.set_integer('python.fn-execution.bundle.size', 1)
    jvm = get_gateway().jvm
    watermark_strategy = WatermarkStrategy(
        jvm.org.apache.flink.api.common.eventtime.WatermarkStrategy.forGenerator(
            jvm.org.apache.flink.streaming.api.functions.python.eventtime
            .PerElementWatermarkGenerator.getSupplier())
    ).with_timestamp_assigner(SecondColumnTimestampAssigner())
    tag = OutputTag('late-data', type_info=Types.ROW([Types.STRING(), Types.INT()]))
    ds1 = self.env.from_collection(
        [('a', 0), ('a', 8), ('a', 4), ('a', 6)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))
    ds2 = ds1.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda e: e[0]) \
        .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
        .allowed_lateness(0) \
        .side_output_late_data(tag) \
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()]))
    main_sink = DataStreamTestSinkFunction()
    ds2.add_sink(main_sink)
    side_sink = DataStreamTestSinkFunction()
    ds2.get_side_output(tag).add_sink(side_sink)

    self.env.execute('test_side_output_late_data')
    main_expected = ['(a,0,5,1)', '(a,5,10,2)']
    self.assert_equals_sorted(main_expected, main_sink.get_results())
    side_expected = ['+I[a, 4]']
    self.assert_equals_sorted(side_expected, side_sink.get_results())

def startup_loopback_server():
    from pyflink.common import Configuration
    from pyflink.fn_execution.beam.beam_worker_pool_service import \
        BeamFnLoopbackWorkerPoolServicer

    # As above, j_configuration comes from the enclosing scope. This older variant
    # uses the environment-variable-style key rather than "python.loopback-server.address".
    config = Configuration(j_configuration=j_configuration)
    config.set_string("PYFLINK_LOOPBACK_SERVER_ADDRESS",
                      BeamFnLoopbackWorkerPoolServicer().start())

def test_from_configuration(self):
    config = Configuration()
    config.set_string("execution.runtime-mode", "batch")

    actual_setting = EnvironmentSettings.from_configuration(config)
    self.assertFalse(actual_setting.is_streaming_mode(), "Use batch mode.")

def test_add_configuration(self):
    table_config = TableConfig.get_default()
    configuration = Configuration()
    configuration.set_string("k1", "v1")

    table_config.add_configuration(configuration)

    self.assertEqual(table_config.get("k1", ""), "v1")

def test_contains_key(self):
    conf = Configuration()
    conf.set_string("k1", "v1")

    contains_k1 = conf.contains_key("k1")
    contains_k2 = conf.contains_key("k2")

    self.assertTrue(contains_k1)
    self.assertFalse(contains_k2)

def test_deepcopy(self):
    conf = Configuration()
    conf.set_string("k1", "v1")

    conf2 = deepcopy(conf)
    self.assertEqual(conf2, conf)

    conf2.set_string("k1", "v2")
    self.assertNotEqual(conf2, conf)

def for_row_type(row_type: RowType,
                 writer_properties: Optional[Configuration] = None,
                 hadoop_config: Optional[Configuration] = None) -> BulkWriterFactory:
    """
    Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into
    Orc files in a batch fashion.

    Example:
    ::

        >>> row_type = DataTypes.ROW([
        ...     DataTypes.FIELD('string', DataTypes.STRING()),
        ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
        ... ])
        >>> row_type_info = Types.ROW_NAMED(
        ...     ['string', 'int_array'],
        ...     [Types.STRING(), Types.LIST(Types.INT())]
        ... )
        >>> sink = FileSink.for_bulk_format(
        ...     OUTPUT_DIR, OrcBulkWriters.for_row_type(
        ...         row_type=row_type,
        ...         writer_properties=Configuration(),
        ...         hadoop_config=Configuration(),
        ...     )
        ... ).build()
        >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

    Note that in the above example, an identity map to indicate its RowTypeInfo is
    necessary before ``sink_to`` when ``ds`` is a source stream producing **RowData**
    records, because RowDataBulkWriterFactory assumes the input record type is Row.
    """
    if not isinstance(row_type, RowType):
        raise TypeError('row_type must be an instance of RowType')

    j_data_type = _to_java_data_type(row_type)
    jvm = get_gateway().jvm
    j_row_type = j_data_type.getLogicalType()
    orc_types = to_jarray(
        jvm.org.apache.flink.table.types.logical.LogicalType,
        [i for i in j_row_type.getChildren()])
    type_description = jvm.org.apache.flink.orc \
        .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
    if writer_properties is None:
        writer_properties = Configuration()
    if hadoop_config is None:
        hadoop_config = Configuration()

    return RowDataBulkWriterFactory(
        jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
            jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                type_description.toString(), orc_types),
            create_java_properties(writer_properties),
            create_hadoop_configuration(hadoop_config)),
        row_type)

def setUp(self):
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self._load_dependency_jars()
    config = Configuration(
        j_configuration=get_j_env_configuration(self.env._j_stream_execution_environment))
    config.set_boolean("execution.checkpointing.checkpoints-after-tasks-finish.enabled", True)
    self.env.set_parallelism(4)
    self.env.enable_checkpointing(100)
    self.env.set_restart_strategy(RestartStrategies.no_restart())
    self.t_env = StreamTableEnvironment.create(self.env)
    self.temp_dir = tempfile.mkdtemp()

def setUp(self) -> None:
    from pyflink.datastream import StreamExecutionEnvironment

    super(DataStreamConversionTestCases, self).setUp()
    config = Configuration()
    config.set_string("akka.ask.timeout", "20 s")
    self.env = StreamExecutionEnvironment.get_execution_environment(config)
    self.t_env = StreamTableEnvironment.create(self.env)

    self.env.set_parallelism(2)
    self.t_env.get_config().set("python.fn-execution.bundle.size", "1")
    self.test_sink = DataStreamTestSinkFunction()

def test_get_execution_environment_with_config(self):
    configuration = Configuration()
    configuration.set_integer('parallelism.default', 12)
    configuration.set_string('pipeline.name', 'haha')
    env = StreamExecutionEnvironment.get_execution_environment(configuration)
    execution_config = env.get_config()

    self.assertEqual(execution_config.get_parallelism(), 12)
    config = Configuration(
        j_configuration=get_j_env_configuration(env._j_stream_execution_environment))
    self.assertEqual(config.get_string('pipeline.name', ''), 'haha')

def _build_parquet_columnar_job(self, row_type: RowType):
    source = FileSource.for_bulk_file_format(
        ParquetColumnarRowInputFormat(row_type, Configuration(), 10, True, False),
        self.parquet_file_name
    ).build()
    ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'parquet-source')
    ds.map(lambda e: e).add_sink(self.test_sink)

def to_configuration(self) -> Configuration:
    """
    Convert to `pyflink.common.Configuration`.

    :return: Configuration with specified value.
    """
    return Configuration(j_configuration=self._j_environment_settings.toConfiguration())

def test_init(self):
    conf = Configuration()
    self.assertEqual(conf.to_dict(), dict())

    conf.set_string("k1", "v1")
    conf2 = Configuration(conf)
    self.assertEqual(conf2.to_dict(), {"k1": "v1"})

def _build_parquet_columnar_job(self, row_type: RowType, parquet_file_name: str):
    # Argument order fixed to match ParquetColumnarRowInputFormat.__init__ below:
    # row_type comes first, then the Hadoop Configuration.
    source = FileSource.for_bulk_file_format(
        ParquetColumnarRowInputFormat(row_type, Configuration(), 10, True, True),
        parquet_file_name).build()
    ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'parquet-source')
    ds.map(PassThroughMapFunction()).add_sink(self.test_sink)

def get_configuration(self):
    """
    Gives direct access to the underlying key-value map for advanced configuration.

    :return: Entire key-value configuration.
    :rtype: Configuration
    """
    return Configuration(j_configuration=self._j_table_config.getConfiguration())

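# A minimal sketch of using get_configuration() to set an option TableConfig does not
# expose through a dedicated setter. It assumes the returned Configuration wraps the
# live backing store, as the constructor above suggests; the option key is only an
# illustrative example.
from pyflink.table import TableConfig

table_config = TableConfig.get_default()
conf = table_config.get_configuration()
conf.set_string("pipeline.name", "my-table-job")
# The write lands in the same underlying key-value map, so TableConfig sees it too.
assert table_config.get("pipeline.name", "") == "my-table-job"
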
def get_configuration(self) -> Configuration:
    """
    Get the underlying `pyflink.common.Configuration`.

    :return: Configuration with specified value.
    """
    return Configuration(
        j_configuration=self._j_environment_settings.getConfiguration())

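# A small round-trip sketch for EnvironmentSettings.get_configuration(), assuming
# from_configuration() preserves options as test_from_configuration above suggests.
from pyflink.common import Configuration
from pyflink.table import EnvironmentSettings

config = Configuration()
config.set_string("execution.runtime-mode", "batch")
settings = EnvironmentSettings.from_configuration(config)
# Read the backing Configuration back out of the settings.
assert settings.get_configuration().get_string("execution.runtime-mode", "") == "batch"
assert not settings.is_streaming_mode()
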
def get_configuration(self):
    """
    Returns all key/value configuration.

    :return: All key/value configuration.
    :rtype: Configuration
    """
    return Configuration(
        j_configuration=self._j_table_config.getConfiguration())

def to_configuration(self) -> Configuration:
    """
    Convert to `pyflink.common.Configuration`.

    It sets the `table.planner` and `execution.runtime-mode` according to the current
    EnvironmentSettings.

    :return: Configuration with specified value.
    """
    return Configuration(j_configuration=self._j_environment_settings.toConfiguration())

def test_key_set(self):
    conf = Configuration()
    conf.set_string("k1", "v1")
    conf.set_string("k2", "v2")
    conf.set_string("k3", "v3")

    key_set = conf.key_set()

    self.assertEqual(key_set, {"k1", "k2", "k3"})

def to_configuration(self) -> Configuration:
    """
    Convert to `pyflink.common.Configuration`.

    :return: Configuration with specified value.

    .. note:: Deprecated in 1.15. Please use
        :func:`EnvironmentSettings.get_configuration` instead.
    """
    return Configuration(
        j_configuration=self._j_environment_settings.toConfiguration())

def _generate_stream_graph(self, clear_transformations: bool = False, job_name: str = None) \
        -> JavaObject:
    gateway = get_gateway()
    JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil

    # start BeamFnLoopbackWorkerPoolServicer when executed in MiniCluster
    j_configuration = get_j_env_configuration(self._j_stream_execution_environment)
    if not self._remote_mode and is_local_deployment(j_configuration):
        from pyflink.common import Configuration
        from pyflink.fn_execution.beam.beam_worker_pool_service import \
            BeamFnLoopbackWorkerPoolServicer

        jvm = gateway.jvm
        env_config = JPythonConfigUtil.getEnvironmentConfig(
            self._j_stream_execution_environment)
        parallelism = self.get_parallelism()
        if parallelism > 1 and env_config.containsKey(jvm.PythonOptions.PYTHON_ARCHIVES.key()):
            import logging
            logging.warning("Loopback mode is disabled as python archives are used and the "
                            "parallelism of the job is greater than 1. The Python user-defined "
                            "functions will be executed in an independent Python process.")
        else:
            config = Configuration(j_configuration=j_configuration)
            config.set_string(
                "python.loopback-server.address", BeamFnLoopbackWorkerPoolServicer().start())

    JPythonConfigUtil.configPythonOperator(self._j_stream_execution_environment)

    gateway.jvm.org.apache.flink.python.chain.PythonOperatorChainingOptimizer.apply(
        self._j_stream_execution_environment)

    JPythonConfigUtil.setPartitionCustomOperatorNumPartitions(
        get_field_value(self._j_stream_execution_environment, "transformations"))

    j_stream_graph = self._j_stream_execution_environment.getStreamGraph(clear_transformations)
    if job_name is not None:
        j_stream_graph.setJobName(job_name)
    return j_stream_graph

def for_row_type(row_type: 'RowType',
                 writer_properties: Optional[Configuration] = None,
                 hadoop_config: Optional[Configuration] = None) -> BulkWriterFactory:
    """
    Create a :class:`~pyflink.common.serialization.BulkWriterFactory` that writes records
    with a predefined schema into Orc files in a batch fashion.

    :param row_type: The RowType of records; it should match the RowTypeInfo of Row records.
    :param writer_properties: Orc writer options.
    :param hadoop_config: Hadoop configuration.
    """
    from pyflink.table.types import RowType
    if not isinstance(row_type, RowType):
        raise TypeError('row_type must be an instance of RowType')

    from pyflink.table.types import _to_java_data_type
    j_data_type = _to_java_data_type(row_type)
    jvm = get_gateway().jvm
    j_row_type = j_data_type.getLogicalType()
    orc_types = to_jarray(
        jvm.org.apache.flink.table.types.logical.LogicalType,
        [i for i in j_row_type.getChildren()])
    type_description = jvm.org.apache.flink.orc \
        .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
    if writer_properties is None:
        writer_properties = Configuration()
    if hadoop_config is None:
        hadoop_config = Configuration()

    return RowDataBulkWriterFactory(
        jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
            jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                type_description.toString(), orc_types),
            create_java_properties(writer_properties),
            create_hadoop_configuration(hadoop_config)),
        row_type)

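# This variant's docstring omits the usage example carried by the first for_row_type
# above, so a sketch of the same wiring follows. Import paths match recent PyFlink
# releases and may differ across versions; OUTPUT_DIR and ds are placeholders taken
# from that earlier example.
from pyflink.common import Configuration
from pyflink.common.typeinfo import Types
from pyflink.datastream.connectors.file_system import FileSink
from pyflink.datastream.formats.orc import OrcBulkWriters
from pyflink.table import DataTypes

row_type = DataTypes.ROW([
    DataTypes.FIELD('string', DataTypes.STRING()),
    DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
])
row_type_info = Types.ROW_NAMED(
    ['string', 'int_array'],
    [Types.STRING(), Types.LIST(Types.INT())]
)
sink = FileSink.for_bulk_format(
    OUTPUT_DIR,  # placeholder output directory
    OrcBulkWriters.for_row_type(
        row_type=row_type,
        writer_properties=Configuration(),
        hadoop_config=Configuration(),
    )
).build()
# The identity map pins the RowTypeInfo so the factory receives Row records, not RowData.
ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)
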
def _write_row_data_to_parquet_file(path: str, row_type: RowType, rows: List[Row]):
    jvm = get_gateway().jvm
    flink = jvm.org.apache.flink

    j_output_stream = flink.core.fs.local.LocalDataOutputStream(jvm.java.io.File(path))
    j_bulk_writer = flink.formats.parquet.row.ParquetRowDataBuilder.createWriterFactory(
        _to_java_data_type(row_type).getLogicalType(),
        create_hadoop_configuration(Configuration()),
        True,
    ).create(j_output_stream)
    row_row_converter = flink.table.data.conversion.RowRowConverter.create(
        _to_java_data_type(row_type)
    )
    row_row_converter.open(row_row_converter.getClass().getClassLoader())
    for row in rows:
        j_bulk_writer.addElement(row_row_converter.toInternal(to_java_data_structure(row)))
    j_bulk_writer.finish()

def __init__(self,
             row_type: RowType,
             hadoop_config: Optional[Configuration] = None,
             batch_size: int = 2048,
             is_utc_timestamp: bool = False,
             is_case_sensitive: bool = True):
    if not hadoop_config:
        hadoop_config = Configuration()

    jvm = get_gateway().jvm
    j_row_type = _to_java_data_type(row_type).getLogicalType()
    produced_type_info = jvm.org.apache.flink.table.runtime.typeutils. \
        InternalTypeInfo.of(j_row_type)
    j_parquet_columnar_format = jvm.org.apache.flink.formats.parquet. \
        ParquetColumnarRowInputFormat(create_hadoop_configuration(hadoop_config),
                                      j_row_type, produced_type_info, batch_size,
                                      is_utc_timestamp, is_case_sensitive)
    super().__init__(j_parquet_columnar_format)

def for_row_type(row_type: RowType,
                 hadoop_config: Optional[Configuration] = None,
                 utc_timestamp: bool = False) -> 'BulkWriterFactory':
    """
    Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into
    Parquet files in a batch fashion.

    Example:
    ::

        >>> row_type = DataTypes.ROW([
        ...     DataTypes.FIELD('string', DataTypes.STRING()),
        ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
        ... ])
        >>> row_type_info = Types.ROW_NAMED(
        ...     ['string', 'int_array'],
        ...     [Types.STRING(), Types.LIST(Types.INT())]
        ... )
        >>> sink = FileSink.for_bulk_format(
        ...     OUTPUT_DIR, ParquetBulkWriter.for_row_type(
        ...         row_type,
        ...         hadoop_config=Configuration(),
        ...         utc_timestamp=True,
        ...     )
        ... ).build()
        >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

    Note that in the above example, an identity map to indicate its RowTypeInfo is
    necessary before ``sink_to`` when ``ds`` is a source stream producing **RowData**
    records, because RowDataBulkWriterFactory assumes the input record type is **Row**.
    """
    if not hadoop_config:
        hadoop_config = Configuration()

    jvm = get_gateway().jvm
    JParquetRowDataBuilder = jvm.org.apache.flink.formats.parquet.row.ParquetRowDataBuilder
    return RowDataBulkWriterFactory(
        JParquetRowDataBuilder.createWriterFactory(
            _to_java_data_type(row_type).getLogicalType(),
            create_hadoop_configuration(hadoop_config),
            utc_timestamp),
        row_type)

def for_row_type(row_type: 'RowType',
                 hadoop_config: Optional[Configuration] = None,
                 utc_timestamp: bool = False) -> 'BulkWriterFactory':
    """
    Create a :class:`~pyflink.common.serialization.BulkWriterFactory` that writes records
    with a predefined schema into Parquet files in a batch fashion.

    :param row_type: The RowType of records; it should match the RowTypeInfo of Row records.
    :param hadoop_config: Hadoop configuration.
    :param utc_timestamp: Whether to use the UTC timezone or the local timezone for the
        conversion between epoch time and LocalDateTime. Hive 0.x/1.x/2.x use the local
        timezone, but Hive 3.x uses UTC.
    """
    if not hadoop_config:
        hadoop_config = Configuration()

    from pyflink.table.types import _to_java_data_type
    jvm = get_gateway().jvm
    JParquetRowDataBuilder = jvm.org.apache.flink.formats.parquet.row.ParquetRowDataBuilder
    return RowDataBulkWriterFactory(
        JParquetRowDataBuilder.createWriterFactory(
            _to_java_data_type(row_type).getLogicalType(),
            create_hadoop_configuration(hadoop_config),
            utc_timestamp),
        row_type)

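# As with the Orc variant, this overload carries no docstring example; a sketch
# mirroring the one from the first Parquet for_row_type above follows. The enclosing
# class is named ParquetBulkWriters in recent releases (ParquetBulkWriter in the older
# example), and import paths may differ across versions; OUTPUT_DIR and ds are
# placeholders.
from pyflink.common import Configuration
from pyflink.common.typeinfo import Types
from pyflink.datastream.connectors.file_system import FileSink
from pyflink.datastream.formats.parquet import ParquetBulkWriters
from pyflink.table import DataTypes

row_type = DataTypes.ROW([
    DataTypes.FIELD('string', DataTypes.STRING()),
    DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
])
row_type_info = Types.ROW_NAMED(
    ['string', 'int_array'],
    [Types.STRING(), Types.LIST(Types.INT())]
)
sink = FileSink.for_bulk_format(
    OUTPUT_DIR,  # placeholder output directory
    ParquetBulkWriters.for_row_type(
        row_type,
        hadoop_config=Configuration(),
        utc_timestamp=True,
    )
).build()
# Identity map pins the RowTypeInfo so the factory receives Row records.
ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)
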
def test_add_all(self):
    conf = Configuration()
    conf.set_string("k1", "v1")
    conf2 = Configuration()

    conf2.add_all(conf)
    value1 = conf2.get_string("k1", "")
    self.assertEqual(value1, "v1")

    conf2.add_all(conf, "conf_")
    value2 = conf2.get_string("conf_k1", "")
    self.assertEqual(value2, "v1")

def test_add_all_to_dict(self):
    conf = Configuration()
    conf.set_string("k1", "v1")
    conf.set_integer("k2", 1)
    conf.set_float("k3", 1.2)
    conf.set_boolean("k4", True)
    conf.set_bytearray("k5", bytearray([1, 2, 3]))

    target_dict = dict()
    conf.add_all_to_dict(target_dict)

    self.assertEqual(target_dict, {
        "k1": "v1",
        "k2": 1,
        "k3": 1.2,
        "k4": True,
        "k5": bytearray([1, 2, 3])
    })

def test_getters_and_setters(self):
    conf = Configuration()

    conf.set_string("str", "v1")
    conf.set_integer("int", 2)
    conf.set_boolean("bool", True)
    conf.set_float("float", 0.5)
    conf.set_bytearray("bytearray", bytearray([1, 2, 3]))

    str_value = conf.get_string("str", "")
    int_value = conf.get_integer("int", 0)
    bool_value = conf.get_boolean("bool", False)
    float_value = conf.get_float("float", 0)
    bytearray_value = conf.get_bytearray("bytearray", bytearray())

    self.assertEqual(str_value, "v1")
    self.assertEqual(int_value, 2)
    self.assertEqual(bool_value, True)
    self.assertEqual(float_value, 0.5)
    self.assertEqual(bytearray_value, bytearray([1, 2, 3]))