def _check_record(data, topic, serialized_data):
    input_type = Types.ROW([Types.STRING()])

    # Build a record serializer whose target topic is chosen per record by _select.
    serialization_schema = KafkaRecordSerializationSchema.builder() \
        .set_topic_selector(_select) \
        .set_value_serialization_schema(
            JsonRowSerializationSchema.builder().with_type_info(input_type).build()) \
        .build()
    jvm = get_gateway().jvm
    serialization_schema._j_serialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext(),
        jvm.org.apache.flink.connector.kafka.sink.DefaultKafkaSinkContext(
            0, 1, jvm.java.util.Properties()))
    sink = KafkaSink.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_record_serializer(serialization_schema) \
        .build()

    # Feed a single row through a mock stream and serialize it directly.
    ds = MockDataStream(Types.ROW([Types.STRING()]))
    ds.sink_to(sink)
    row = Row(data)
    topic_row = ds.feed(row)  # type: Row
    j_record = serialization_schema._j_serialization_schema.serialize(
        to_java_data_structure(topic_row), None, None)

    # The produced record should land in the expected topic with the expected value.
    self.assertEqual(j_record.topic(), topic)
    self.assertIsNone(j_record.key())
    self.assertEqual(j_record.value(), serialized_data)
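# A minimal sketch of how _check_record might be driven: it assumes an enclosing test
# method that defines the _select topic selector referenced above. The selector logic,
# topic names, and expected payloads here are illustrative assumptions, not taken from
# the original.
def test_set_topic_selector_sketch(self):
    def _select(data):
        # Hypothetical routing: derive the topic from the row's first field.
        return 'topic-' + data[0]

    def _check_record(data, topic, serialized_data):
        ...  # body as defined above

    _check_record('a', 'topic-a', b'{"f0":"a"}')
    _check_record('b', 'topic-b', b'{"f0":"b"}')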
def _build_orc_job(self, row_type: RowType, row_type_info: RowTypeInfo, data: List[Row]):
    jvm = get_gateway().jvm
    # Bulk-encode rows as ORC files under the test's ORC output directory.
    sink = FileSink.for_bulk_format(
        self.orc_dir_name,
        OrcBulkWriters.for_row_type(row_type)
    ).build()

    # Convert the Python rows to Java objects and build a bounded source from them.
    j_list = jvm.java.util.ArrayList()
    for d in data:
        j_list.add(to_java_data_structure(d))
    ds = DataStream(self.env._j_stream_execution_environment.fromCollection(
        j_list,
        row_type_info.get_java_type_info()
    ))
    ds.sink_to(sink)
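# A minimal sketch of how _build_orc_job might be called from a test. The field names,
# the Row values, and the final env.execute() call are illustrative assumptions; only
# _build_orc_job itself comes from the code above.
def test_orc_sink_sketch(self):
    row_type = DataTypes.ROW([
        DataTypes.FIELD('name', DataTypes.STRING()),
        DataTypes.FIELD('num', DataTypes.INT()),
    ])
    row_type_info = Types.ROW_NAMED(['name', 'num'], [Types.STRING(), Types.INT()])
    data = [Row('a', 1), Row('b', 2)]
    self._build_orc_job(row_type, row_type_info, data)
    self.env.execute('test_orc_sink_sketch')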
def _write_row_data_to_parquet_file(path: str, row_type: RowType, rows: List[Row]):
    jvm = get_gateway().jvm
    flink = jvm.org.apache.flink

    # Write directly through the Java Parquet writer factory, bypassing a Flink job.
    j_output_stream = flink.core.fs.local.LocalDataOutputStream(jvm.java.io.File(path))
    j_bulk_writer = flink.formats.parquet.row.ParquetRowDataBuilder.createWriterFactory(
        _to_java_data_type(row_type).getLogicalType(),
        create_hadoop_configuration(Configuration()),
        True,
    ).create(j_output_stream)

    # Convert each external Row into Flink's internal RowData representation before writing.
    row_row_converter = flink.table.data.conversion.RowRowConverter.create(
        _to_java_data_type(row_type)
    )
    row_row_converter.open(row_row_converter.getClass().getClassLoader())
    for row in rows:
        j_bulk_writer.addElement(row_row_converter.toInternal(to_java_data_structure(row)))
    j_bulk_writer.finish()
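# A minimal sketch of how _write_row_data_to_parquet_file might be used to prepare a
# Parquet test fixture. The temporary path, schema, and rows are illustrative
# assumptions, and it presumes tempfile, DataTypes, and Row are importable in this module.
def _prepare_parquet_fixture_sketch():
    parquet_path = tempfile.mkdtemp() + '/data.parquet'
    row_type = DataTypes.ROW([
        DataTypes.FIELD('name', DataTypes.STRING()),
        DataTypes.FIELD('num', DataTypes.INT()),
    ])
    rows = [Row('a', 1), Row('b', 2)]
    _write_row_data_to_parquet_file(parquet_path, row_type, rows)
    return parquet_path, row_type, rows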
def test_set_topic(self):
    input_type = Types.ROW([Types.STRING()])

    # Every record produced by this serializer goes to the fixed topic 'test-topic'.
    serialization_schema = KafkaRecordSerializationSchema.builder() \
        .set_topic('test-topic') \
        .set_value_serialization_schema(
            JsonRowSerializationSchema.builder().with_type_info(input_type).build()) \
        .build()
    jvm = get_gateway().jvm
    serialization_schema._j_serialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext(),
        jvm.org.apache.flink.connector.kafka.sink.DefaultKafkaSinkContext(
            0, 1, jvm.java.util.Properties()))

    j_record = serialization_schema._j_serialization_schema.serialize(
        to_java_data_structure(Row('test')), None, None)
    self.assertEqual(j_record.topic(), 'test-topic')
    self.assertIsNone(j_record.key())
    self.assertEqual(j_record.value(), b'{"f0":"test"}')
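# A minimal companion sketch: the same builder also accepts a key serializer via
# set_key_serialization_schema, in which case the produced record carries a non-null
# key. The test name and expected byte strings below are illustrative assumptions.
def test_set_key_serialization_schema_sketch(self):
    input_type = Types.ROW([Types.STRING()])
    key_schema = JsonRowSerializationSchema.builder().with_type_info(input_type).build()
    value_schema = JsonRowSerializationSchema.builder().with_type_info(input_type).build()

    serialization_schema = KafkaRecordSerializationSchema.builder() \
        .set_topic('test-topic') \
        .set_key_serialization_schema(key_schema) \
        .set_value_serialization_schema(value_schema) \
        .build()
    jvm = get_gateway().jvm
    serialization_schema._j_serialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext(),
        jvm.org.apache.flink.connector.kafka.sink.DefaultKafkaSinkContext(
            0, 1, jvm.java.util.Properties()))

    j_record = serialization_schema._j_serialization_schema.serialize(
        to_java_data_structure(Row('test')), None, None)
    self.assertEqual(j_record.key(), b'{"f0":"test"}')
    self.assertEqual(j_record.value(), b'{"f0":"test"}')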