def for_row_type(row_type: RowType,
                 writer_properties: Optional[Configuration] = None,
                 hadoop_config: Optional[Configuration] = None) \
        -> BulkWriterFactory:
    """
    Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into
    Orc files in a batch fashion.

    Example:
    ::

        >>> row_type = DataTypes.ROW([
        ...     DataTypes.FIELD('string', DataTypes.STRING()),
        ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
        ... ])
        >>> row_type_info = Types.ROW_NAMED(
        ...     ['string', 'int_array'],
        ...     [Types.STRING(), Types.LIST(Types.INT())]
        ... )
        >>> sink = FileSink.for_bulk_format(
        ...     OUTPUT_DIR, OrcBulkWriters.for_row_type(
        ...         row_type=row_type,
        ...         writer_properties=Configuration(),
        ...         hadoop_config=Configuration(),
        ...     )
        ... ).build()
        >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

    Note that in the above example, an identity map to indicate its RowTypeInfo is necessary
    before ``sink_to`` when ``ds`` is a source stream producing **RowData** records,
    because RowDataBulkWriterFactory assumes the input record type is Row.
    """
    if not isinstance(row_type, RowType):
        raise TypeError('row_type must be an instance of RowType')

    j_data_type = _to_java_data_type(row_type)
    jvm = get_gateway().jvm
    j_row_type = j_data_type.getLogicalType()
    orc_types = to_jarray(
        jvm.org.apache.flink.table.types.logical.LogicalType,
        [i for i in j_row_type.getChildren()])
    type_description = jvm.org.apache.flink.orc \
        .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
    if writer_properties is None:
        writer_properties = Configuration()
    if hadoop_config is None:
        hadoop_config = Configuration()

    return RowDataBulkWriterFactory(
        jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
            jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                type_description.toString(),
                orc_types),
            create_java_properties(writer_properties),
            create_hadoop_configuration(hadoop_config)),
        row_type)
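# A brief sketch (not from the original source): writer_properties is converted with
# create_java_properties() into java.util.Properties for the Java OrcBulkWriterFactory, so
# standard Orc writer keys can be passed through it. The 'orc.compress' key below is the
# stock Apache Orc compression option; treating it as accepted here is an assumption, and
# _orc_factory_with_compression is a hypothetical helper name.
def _orc_factory_with_compression(row_type):
    from pyflink.common import Configuration

    props = Configuration()
    props.set_string('orc.compress', 'SNAPPY')  # Orc codec selection via writer properties
    return for_row_type(row_type=row_type, writer_properties=props)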
def _write_row_data_to_parquet_file(path: str, row_type: RowType, rows: List[Row]):
    jvm = get_gateway().jvm
    flink = jvm.org.apache.flink

    j_output_stream = flink.core.fs.local.LocalDataOutputStream(jvm.java.io.File(path))
    j_bulk_writer = flink.formats.parquet.row.ParquetRowDataBuilder.createWriterFactory(
        _to_java_data_type(row_type).getLogicalType(),
        create_hadoop_configuration(Configuration()),
        True,
    ).create(j_output_stream)
    row_row_converter = flink.table.data.conversion.RowRowConverter.create(
        _to_java_data_type(row_type)
    )
    row_row_converter.open(row_row_converter.getClass().getClassLoader())
    for row in rows:
        j_bulk_writer.addElement(row_row_converter.toInternal(to_java_data_structure(row)))
    j_bulk_writer.finish()
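# A hedged usage sketch for the helper above: it writes two Row records to a local Parquet
# file through the Java ParquetRowDataBuilder bulk writer. The file name, field layout and
# the _example_write_rows name are illustrative assumptions, not part of this module.
def _example_write_rows(tmp_dir):
    from pyflink.common import Row
    from pyflink.table.types import DataTypes

    row_type = DataTypes.ROW([
        DataTypes.FIELD('name', DataTypes.STRING()),
        DataTypes.FIELD('count', DataTypes.INT()),
    ])
    rows = [Row('a', 1), Row('b', 2)]
    _write_row_data_to_parquet_file(tmp_dir + '/data.parquet', row_type, rows)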
def __init__(self,
             row_type: RowType,
             hadoop_config: Optional[Configuration] = None,
             batch_size: int = 2048,
             is_utc_timestamp: bool = False,
             is_case_sensitive: bool = True):
    if not hadoop_config:
        hadoop_config = Configuration()

    jvm = get_gateway().jvm
    j_row_type = _to_java_data_type(row_type).getLogicalType()
    produced_type_info = jvm.org.apache.flink.table.runtime.typeutils. \
        InternalTypeInfo.of(j_row_type)
    j_parquet_columnar_format = jvm.org.apache.flink.formats.parquet. \
        ParquetColumnarRowInputFormat(create_hadoop_configuration(hadoop_config),
                                      j_row_type, produced_type_info, batch_size,
                                      is_utc_timestamp, is_case_sensitive)
    super().__init__(j_parquet_columnar_format)
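# A hedged usage sketch for the constructor above, assuming it belongs to the
# ParquetColumnarRowInputFormat wrapper class: build the format and feed it to a FileSource.
# The _example_parquet_source name, path argument and watermark strategy are illustrative
# assumptions rather than requirements of this API.
def _example_parquet_source(env, row_type, parquet_path):
    from pyflink.common import Configuration, WatermarkStrategy
    from pyflink.datastream.connectors.file_system import FileSource

    parquet_format = ParquetColumnarRowInputFormat(  # assumed enclosing class name
        row_type=row_type,
        hadoop_config=Configuration(),
        batch_size=2048,
        is_utc_timestamp=False,
        is_case_sensitive=True,
    )
    source = FileSource.for_bulk_file_format(parquet_format, parquet_path).build()
    return env.from_source(source, WatermarkStrategy.no_watermarks(), 'parquet-source')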
def for_row_type(row_type: RowType,
                 hadoop_config: Optional[Configuration] = None,
                 utc_timestamp: bool = False) -> 'BulkWriterFactory':
    """
    Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into
    Parquet files in a batch fashion.

    Example:
    ::

        >>> row_type = DataTypes.ROW([
        ...     DataTypes.FIELD('string', DataTypes.STRING()),
        ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
        ... ])
        >>> row_type_info = Types.ROW_NAMED(
        ...     ['string', 'int_array'],
        ...     [Types.STRING(), Types.LIST(Types.INT())]
        ... )
        >>> sink = FileSink.for_bulk_format(
        ...     OUTPUT_DIR, ParquetBulkWriter.for_row_type(
        ...         row_type,
        ...         hadoop_config=Configuration(),
        ...         utc_timestamp=True,
        ...     )
        ... ).build()
        >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

    Note that in the above example, an identity map to indicate its RowTypeInfo is necessary
    before ``sink_to`` when ``ds`` is a source stream producing **RowData** records,
    because RowDataBulkWriterFactory assumes the input record type is **Row**.
    """
    if not hadoop_config:
        hadoop_config = Configuration()

    jvm = get_gateway().jvm
    JParquetRowDataBuilder = jvm.org.apache.flink.formats.parquet.row.ParquetRowDataBuilder
    return RowDataBulkWriterFactory(
        JParquetRowDataBuilder.createWriterFactory(
            _to_java_data_type(row_type).getLogicalType(),
            create_hadoop_configuration(hadoop_config),
            utc_timestamp),
        row_type)
def for_row_type(row_type: 'RowType',
                 writer_properties: Optional[Configuration] = None,
                 hadoop_config: Optional[Configuration] = None) \
        -> BulkWriterFactory:
    """
    Create a :class:`~pyflink.common.serialization.BulkWriterFactory` that writes records
    with a predefined schema into Orc files in a batch fashion.

    :param row_type: The RowType of records; it should match the RowTypeInfo of Row records.
    :param writer_properties: Orc writer options.
    :param hadoop_config: Hadoop configuration.
    """
    from pyflink.table.types import RowType
    if not isinstance(row_type, RowType):
        raise TypeError('row_type must be an instance of RowType')

    from pyflink.table.types import _to_java_data_type
    j_data_type = _to_java_data_type(row_type)
    jvm = get_gateway().jvm
    j_row_type = j_data_type.getLogicalType()
    orc_types = to_jarray(
        jvm.org.apache.flink.table.types.logical.LogicalType,
        [i for i in j_row_type.getChildren()])
    type_description = jvm.org.apache.flink.orc \
        .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
    if writer_properties is None:
        writer_properties = Configuration()
    if hadoop_config is None:
        hadoop_config = Configuration()

    return RowDataBulkWriterFactory(
        jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
            jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                type_description.toString(),
                orc_types),
            create_java_properties(writer_properties),
            create_hadoop_configuration(hadoop_config)),
        row_type)
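# A minimal sketch (not from the original source) of plugging the factory above into a
# FileSink. The _example_orc_sink name and its arguments (a DataStream of Row records, the
# matching row_type/row_type_info, and an output directory) are assumptions supplied by the
# caller, not part of this module.
def _example_orc_sink(ds, row_type, row_type_info, output_dir):
    from pyflink.common import Configuration
    from pyflink.datastream.connectors.file_system import FileSink

    sink = FileSink.for_bulk_format(
        output_dir,
        for_row_type(                      # the factory method defined above
            row_type=row_type,
            writer_properties=Configuration(),
            hadoop_config=Configuration(),
        )
    ).build()
    # An identity map pins the RowTypeInfo so the writer receives Row records.
    return ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)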
def for_row_type(row_type: 'RowType',
                 hadoop_config: Optional[Configuration] = None,
                 utc_timestamp: bool = False) -> 'BulkWriterFactory':
    """
    Create a :class:`~pyflink.common.serialization.BulkWriterFactory` that writes records
    with a predefined schema into Parquet files in a batch fashion.

    :param row_type: The RowType of records; it should match the RowTypeInfo of Row records.
    :param hadoop_config: Hadoop configuration.
    :param utc_timestamp: Whether to use the UTC timezone or the local timezone for the
        conversion between epoch time and LocalDateTime. Hive 0.x/1.x/2.x uses the local
        timezone, but Hive 3.x uses the UTC timezone.
    """
    if not hadoop_config:
        hadoop_config = Configuration()

    from pyflink.table.types import _to_java_data_type
    jvm = get_gateway().jvm
    JParquetRowDataBuilder = jvm.org.apache.flink.formats.parquet.row.ParquetRowDataBuilder
    return RowDataBulkWriterFactory(
        JParquetRowDataBuilder.createWriterFactory(
            _to_java_data_type(row_type).getLogicalType(),
            create_hadoop_configuration(hadoop_config),
            utc_timestamp),
        row_type)
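# A hedged sketch (not from the original source) showing the utc_timestamp flag discussed in
# the docstring above: when the files are meant to be read by Hive 3.x, timestamps should be
# written with the UTC convention. The _example_parquet_sink_for_hive3 name and its arguments
# are illustrative assumptions.
def _example_parquet_sink_for_hive3(ds, row_type, row_type_info, output_dir):
    from pyflink.common import Configuration
    from pyflink.datastream.connectors.file_system import FileSink

    factory = for_row_type(               # the factory method defined above
        row_type=row_type,
        hadoop_config=Configuration(),
        utc_timestamp=True,               # Hive 3.x expects the UTC convention
    )
    sink = FileSink.for_bulk_format(output_dir, factory).build()
    return ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)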