def test_cython_row_coder(self):
    from pyflink.common import Row, RowKind
    field_count = 2
    field_names = ['f{}'.format(i) for i in range(field_count)]
    row = Row(**{field_names[i]: None if i % 2 == 0 else i
                 for i in range(field_count)})
    data = [row]
    python_field_coders = [coder_impl.RowCoderImpl(
        [coder_impl.BigIntCoderImpl() for _ in range(field_count)],
        field_names)]
    cython_field_coders = [coder_impl_fast.RowCoderImpl(
        [coder_impl_fast.BigIntCoderImpl() for _ in range(field_count)],
        field_names)]
    row.set_row_kind(RowKind.INSERT)
    self.check_cython_coder(python_field_coders, cython_field_coders, data)
    row.set_row_kind(RowKind.UPDATE_BEFORE)
    self.check_cython_coder(python_field_coders, cython_field_coders, data)
    row.set_row_kind(RowKind.UPDATE_AFTER)
    self.check_cython_coder(python_field_coders, cython_field_coders, data)
    row.set_row_kind(RowKind.DELETE)
    self.check_cython_coder(python_field_coders, cython_field_coders, data)
def test_mixed_with_built_in_functions_without_retract(self):
    self.t_env.get_config().get_configuration().set_string(
        "parallelism.default", "1")
    self.t_env.create_temporary_system_function(
        "concat", ConcatAggregateFunction())
    t = self.t_env.from_elements(
        [('Hi', 2), ('Hi', 4), (None, None), ('hello2', 8), ('hello', 10)],
        ['b', 'c'])
    self.t_env.create_temporary_view("source", t)
    result_table = self.t_env.sql_query(
        "select concat(b, ',') as a, "
        "FIRST_VALUE(b) as b, "
        "LAST_VALUE(b) as c, "
        "COUNT(c) as d, "
        "COUNT(1) as e, "
        "LISTAGG(b) as f,"
        "LISTAGG(b, '|') as g,"
        "MAX(c) as h,"
        "MAX(cast(c as float) + 1) as i,"
        "MIN(c) as j,"
        "MIN(cast(c as decimal) + 1) as k,"
        "SUM(c) as l,"
        "SUM(cast(c as float) + 1) as m "
        "from source")
    result = [i for i in result_table.execute().collect()]
    expected = Row('Hi,Hi,hello,hello2', 'Hi', 'hello', 4, 5,
                   'Hi,Hi,hello2,hello', 'Hi|Hi|hello2|hello', 10, 11.0, 2,
                   Decimal(3.0), 24, 28.0)
    expected.set_row_kind(RowKind.UPDATE_AFTER)
    self.assertEqual(result[len(result) - 1], expected)
def setUpClass(cls):
    super(PandasConversionTestBase, cls).setUpClass()
    cls.data = [(1, 1, 1, 1, True, 1.1, 1.2, 'hello', bytearray(b"aaa"),
                 decimal.Decimal('1000000000000000000.01'),
                 datetime.date(2014, 9, 13),
                 datetime.time(hour=1, minute=0, second=1),
                 datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                 ['hello', '中文'],
                 Row(a=1, b='hello',
                     c=datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                     d=[1, 2])),
                (1, 2, 2, 2, False, 2.1, 2.2, 'world', bytearray(b"bbb"),
                 decimal.Decimal('1000000000000000000.02'),
                 datetime.date(2014, 9, 13),
                 datetime.time(hour=1, minute=0, second=1),
                 datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                 ['hello', '中文'],
                 Row(a=1, b='hello',
                     c=datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                     d=[1, 2]))]
    cls.data_type = DataTypes.ROW(
        [DataTypes.FIELD("f1", DataTypes.TINYINT()),
         DataTypes.FIELD("f2", DataTypes.SMALLINT()),
         DataTypes.FIELD("f3", DataTypes.INT()),
         DataTypes.FIELD("f4", DataTypes.BIGINT()),
         DataTypes.FIELD("f5", DataTypes.BOOLEAN()),
         DataTypes.FIELD("f6", DataTypes.FLOAT()),
         DataTypes.FIELD("f7", DataTypes.DOUBLE()),
         DataTypes.FIELD("f8", DataTypes.STRING()),
         DataTypes.FIELD("f9", DataTypes.BYTES()),
         DataTypes.FIELD("f10", DataTypes.DECIMAL(38, 18)),
         DataTypes.FIELD("f11", DataTypes.DATE()),
         DataTypes.FIELD("f12", DataTypes.TIME()),
         DataTypes.FIELD("f13", DataTypes.TIMESTAMP(3)),
         DataTypes.FIELD("f14", DataTypes.ARRAY(DataTypes.STRING())),
         DataTypes.FIELD("f15", DataTypes.ROW(
             [DataTypes.FIELD("a", DataTypes.INT()),
              DataTypes.FIELD("b", DataTypes.STRING()),
              DataTypes.FIELD("c", DataTypes.TIMESTAMP(3)),
              DataTypes.FIELD("d", DataTypes.ARRAY(DataTypes.INT()))]))],
        False)
    cls.pdf = cls.create_pandas_data_frame()
def test_map(self):
    t = self.t_env.from_elements(
        [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT())]))
    sink_table_ddl = """
        CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)
    func = udf(lambda x: Row(a=x + 1, b=x * x),
               result_type=DataTypes.ROW(
                   [DataTypes.FIELD("a", DataTypes.BIGINT()),
                    DataTypes.FIELD("b", DataTypes.BIGINT())]))
    func2 = udf(lambda x: Row(x.a + 1, x.b * 2),
                result_type=DataTypes.ROW(
                    [DataTypes.FIELD("a", DataTypes.BIGINT()),
                     DataTypes.FIELD("b", DataTypes.BIGINT())]))
    t.map(func(t.b)).alias("a", "b") \
        .map(func(t.a)) \
        .map(func2) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(
        actual,
        ["+I[5, 18]", "+I[4, 8]", "+I[8, 72]", "+I[11, 162]", "+I[6, 32]"])
def test_map(self):
    t = self.t_env.from_elements(
        [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT())
        ]))
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)
    func = udf(lambda x: Row(a=x + 1, b=x * x),
               result_type=DataTypes.ROW([
                   DataTypes.FIELD("a", DataTypes.BIGINT()),
                   DataTypes.FIELD("b", DataTypes.BIGINT())
               ]))
    func2 = udf(lambda x: Row(x.a + 1, x.b * 2),
                result_type=DataTypes.ROW([
                    DataTypes.FIELD("a", DataTypes.BIGINT()),
                    DataTypes.FIELD("b", DataTypes.BIGINT())
                ]))
    t.map(func(t.b)).alias("a", "b") \
        .map(func(t.a)) \
        .map(func2) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(
        actual,
        ["+I[5, 18]", "+I[4, 8]", "+I[8, 72]", "+I[11, 162]", "+I[6, 32]"])
def batch_seq_num_test():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(2)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    seq_num_source = NumberSequenceSource(1, 1000)
    output_path = '/opt/examples/output/batch_seq_num'
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(OutputFileConfig.builder()
                                 .with_part_prefix('pre')
                                 .with_part_suffix('suf')
                                 .build()) \
        .build()
    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='file_source',
        type_info=Types.LONG())
    ds.map(lambda a: Row(a % 4, 1),
           output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
        .sink_to(file_sink)
    env.execute('9-data_stream_batch_seq_num')
class RowDataConverter(DataConverter):

    def __init__(self, field_data_converters: List[DataConverter],
                 field_names: List[str]):
        self._field_data_converters = field_data_converters
        self._reuse_row = Row()
        self._reuse_external_row_data = [
            None for _ in range(len(field_data_converters))]
        self._reuse_external_row = [None, self._reuse_external_row_data]
        self._reuse_row.set_field_names(field_names)

    def to_internal(self, value) -> IN:
        if value is None:
            return None
        self._reuse_row._values = [
            self._field_data_converters[i].to_internal(item)
            for i, item in enumerate(value[1])]
        self._reuse_row.set_row_kind(RowKind(value[0]))
        return self._reuse_row

    def to_external(self, value: Row) -> OUT:
        if value is None:
            return None
        self._reuse_external_row[0] = value.get_row_kind().value
        values = value._values
        for i in range(len(values)):
            self._reuse_external_row_data[i] = \
                self._field_data_converters[i].to_external(values[i])
        return self._reuse_external_row
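A minimal round-trip sketch of RowDataConverter, assuming a hypothetical pass-through field converter (real field converters are supplied by the framework) and the two-element external format [row_kind_value, field_values] visible in to_internal/to_external above:

# Hypothetical stand-in for a real field converter; illustration only.
class _PassThroughConverter(DataConverter):
    def to_internal(self, value):
        return value

    def to_external(self, value):
        return value


conv = RowDataConverter([_PassThroughConverter(), _PassThroughConverter()],
                        ['a', 'b'])
row = conv.to_internal([RowKind.INSERT.value, [1, 'x']])
# row is the single reused Row instance: a=1, b='x', row kind INSERT.
assert conv.to_external(row) == [RowKind.INSERT.value, [1, 'x']]

Note the reuse pattern: both the internal Row and the external list are allocated once in __init__ and mutated per record, so callers must consume (or copy) each converted value before the next call.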
def setUp(self):
    super(StringIndexerTest, self).setUp()
    self.train_table = self.t_env.from_data_stream(
        self.env.from_collection([
            ('a', 1.0),
            ('b', 1.0),
            ('b', 2.0),
            ('c', 0.0),
            ('d', 2.0),
            ('a', 2.0),
            ('b', 2.0),
            ('b', -1.0),
            ('a', -1.0),
            ('c', -1.0),
        ], type_info=Types.ROW_NAMED(
            ['input_col1', 'input_col2'],
            [Types.STRING(), Types.DOUBLE()])))
    self.predict_table = self.t_env.from_data_stream(
        self.env.from_collection([
            ('a', 2.0),
            ('b', 1.0),
            ('e', 2.0),
        ], type_info=Types.ROW_NAMED(
            ['input_col1', 'input_col2'],
            [Types.STRING(), Types.DOUBLE()])))
    self.expected_alphabetic_asc_predict_data = [
        Row('a', 2.0, 0, 3),
        Row('b', 1.0, 1, 2),
        Row('e', 2.0, 4, 3)
    ]
def data_stream_batch_test():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(2)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    input_path = '/opt/examples/data/word_count_input'
    output_path = '/opt/examples/output/data_stream_batch'
    file_source = FileSource \
        .for_record_stream_format(StreamFormat.text_line_format(), input_path) \
        .process_static_file_set() \
        .build()
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(OutputFileConfig.builder()
                                 .with_part_prefix('pre')
                                 .with_part_suffix('suf')
                                 .build()) \
        .build()
    ds = env.from_source(
        source=file_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='file_source',
        type_info=Types.STRING())
    ds.map(lambda a: Row(a, 1),
           output_type=Types.ROW([Types.STRING(), Types.INT()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
        .sink_to(file_sink)
    env.execute('8-data_stream_batch')
def test_cython_row_coder(self):
    from pyflink.common import Row, RowKind
    field_count = 2
    row = Row(*[None if i % 2 == 0 else i for i in range(field_count)])
    data = [row]
    python_field_coders = [coder_impl.RowCoderImpl(
        [coder_impl.BigIntCoderImpl() for _ in range(field_count)])]
    cython_field_coders = [coder_impl_fast.RowCoderImpl(
        [coder_impl_fast.BigIntCoderImpl() for _ in range(field_count)])]
    row.set_row_kind(RowKind.INSERT)
    self.check_cython_coder(python_field_coders, cython_field_coders, [data])
    row.set_row_kind(RowKind.UPDATE_BEFORE)
    self.check_cython_coder(python_field_coders, cython_field_coders, [data])
    row.set_row_kind(RowKind.UPDATE_AFTER)
    self.check_cython_coder(python_field_coders, cython_field_coders, [data])
    row.set_row_kind(RowKind.DELETE)
    self.check_cython_coder(python_field_coders, cython_field_coders, [data])
def test_map_with_pandas_udf(self):
    t = self.t_env.from_elements(
        [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)),
         (2, Row(3, 4))],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.ROW([
                DataTypes.FIELD("c", DataTypes.INT()),
                DataTypes.FIELD("d", DataTypes.INT())
            ]))
        ]))
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)

    def func(x):
        import pandas as pd
        res = pd.concat([x.a, x.c + x.d], axis=1)
        return res

    def func2(x):
        return x * 2

    def func3(x):
        assert isinstance(x, Row)
        return x

    pandas_udf = udf(func,
                     result_type=DataTypes.ROW([
                         DataTypes.FIELD("c", DataTypes.BIGINT()),
                         DataTypes.FIELD("d", DataTypes.BIGINT())
                     ]),
                     func_type='pandas')
    pandas_udf_2 = udf(func2,
                       result_type=DataTypes.ROW([
                           DataTypes.FIELD("c", DataTypes.BIGINT()),
                           DataTypes.FIELD("d", DataTypes.BIGINT())
                       ]),
                       func_type='pandas')
    general_udf = udf(func3,
                      result_type=DataTypes.ROW([
                          DataTypes.FIELD("c", DataTypes.BIGINT()),
                          DataTypes.FIELD("d", DataTypes.BIGINT())
                      ]))
    t.map(pandas_udf).map(pandas_udf_2).map(general_udf) \
        .execute_insert("Results").wait()
    actual = source_sink_utils.results()
    self.assert_equals(
        actual,
        ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
def encode_to_stream(self, value: Row, out_stream: OutputStream):
    # encode the mask value (null bits plus row kind)
    values = value.get_fields_by_names(self._field_names)
    self._mask_utils.write_mask(values, value.get_row_kind().value, out_stream)

    # encode every non-null field value
    for i in range(self._field_count):
        item = values[i]
        if item is not None:
            self._field_coders[i].encode_to_stream(item, out_stream)
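For intuition, here is a plain-Python sketch of the kind of mask write_mask produces. It assumes the layout used by this coder family (row kind in the leading two bits, then one null bit per field, zero-padded to whole bytes); it is illustrative only and not the actual MaskUtils implementation:

def pack_mask(values, row_kind_value, row_kind_bits=2):
    bits = []
    # leading bits encode the row kind (INSERT=0, UPDATE_BEFORE=1, ...)
    for i in range(row_kind_bits - 1, -1, -1):
        bits.append((row_kind_value >> i) & 1)
    # one bit per field: 1 means the field is null and its value is skipped
    bits.extend(1 if v is None else 0 for v in values)
    # zero-pad to a whole number of bytes
    while len(bits) % 8 != 0:
        bits.append(0)
    return bytes(int(''.join(map(str, bits[i:i + 8])), 2)
                 for i in range(0, len(bits), 8))

# pack_mask([1, None, 3], 0) == b'\x10'  (INSERT, second field null)

This is also why encode_to_stream only serializes non-null fields: the decoder can recover the null positions from the mask alone.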
def _emit_output(self, output_result):
    for result in output_result:
        yield Row(None, None, None, result)

    for result in self._collector.buf:
        # 0: proc time timer data
        # 1: event time timer data
        # 2: normal data
        # result_row: [TIMER_FLAG, TIMER_TYPE, TIMER_KEY, RESULT_DATA]
        yield Row(result[0], result[1], result[2], None)

    self._collector.clear()
def wrapped_func(value):
    if value[0]:
        result = co_flat_map_func.flat_map1(value[1])
        if result:
            for result_val in result:
                yield Row(CoFlatMapFunctionOutputFlag.LEFT.value, result_val)
        yield Row(CoFlatMapFunctionOutputFlag.LEFT_END.value, None)
    else:
        result = co_flat_map_func.flat_map2(value[2])
        if result:
            for result_val in result:
                yield Row(CoFlatMapFunctionOutputFlag.RIGHT.value, result_val)
        yield Row(CoFlatMapFunctionOutputFlag.RIGHT_END.value, None)
def _emit_results(timestamp, watermark, results, has_side_output):
    if results:
        if has_side_output:
            for result in results:
                if isinstance(result, tuple) and isinstance(result[0], OutputTag):
                    yield cast(OutputTag, result[0]).tag_id, Row(
                        timestamp, watermark, result[1])
                else:
                    yield DEFAULT_OUTPUT_TAG, Row(timestamp, watermark, result)
        else:
            for result in results:
                yield Row(timestamp, watermark, result)
def test_row_coder(self):
    from pyflink.common import Row, RowKind
    field_coder = BigIntCoder()
    field_count = 10
    coder = RowCoder([field_coder for _ in range(field_count)])
    v = Row(*[None if i % 2 == 0 else i for i in range(field_count)])
    v.set_row_kind(RowKind.INSERT)
    self.check_coder(coder, v)
    v.set_row_kind(RowKind.UPDATE_BEFORE)
    self.check_coder(coder, v)
    v.set_row_kind(RowKind.UPDATE_AFTER)
    self.check_coder(coder, v)
    v.set_row_kind(RowKind.DELETE)
    self.check_coder(coder, v)
def test_row_coder(self):
    from pyflink.common import Row, RowKind
    field_coder = BigIntCoder()
    field_count = 10
    field_names = ['f{}'.format(i) for i in range(field_count)]
    coder = RowCoder([field_coder for _ in range(field_count)], field_names)
    v = Row(**{field_names[i]: None if i % 2 == 0 else i
               for i in range(field_count)})
    v.set_row_kind(RowKind.INSERT)
    self.check_coder(coder, v)
    v.set_row_kind(RowKind.UPDATE_BEFORE)
    self.check_coder(coder, v)
    v.set_row_kind(RowKind.UPDATE_AFTER)
    self.check_coder(coder, v)
    v.set_row_kind(RowKind.DELETE)
    self.check_coder(coder, v)
def test_map_with_pandas_udf(self):
    t = self.t_env.from_elements(
        [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)),
         (2, Row(3, 4))],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.ROW(
                 [DataTypes.FIELD("c", DataTypes.INT()),
                  DataTypes.FIELD("d", DataTypes.INT())]))]))
    sink_table_ddl = """
        CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)

    def func(x):
        import pandas as pd
        res = pd.concat([x.a, x.c + x.d], axis=1)
        return res

    def func2(x):
        return x * 2

    def func3(x):
        assert isinstance(x, Row)
        return x

    pandas_udf = udf(func,
                     result_type=DataTypes.ROW(
                         [DataTypes.FIELD("c", DataTypes.BIGINT()),
                          DataTypes.FIELD("d", DataTypes.BIGINT())]),
                     func_type='pandas')
    pandas_udf_2 = udf(func2,
                       result_type=DataTypes.ROW(
                           [DataTypes.FIELD("c", DataTypes.BIGINT()),
                            DataTypes.FIELD("d", DataTypes.BIGINT())]),
                       func_type='pandas')
    general_udf = udf(func3,
                      result_type=DataTypes.ROW(
                          [DataTypes.FIELD("c", DataTypes.BIGINT()),
                           DataTypes.FIELD("d", DataTypes.BIGINT())]))
    t.map(pandas_udf).map(pandas_udf_2).map(general_udf) \
        .execute_insert("Results").wait()
    actual = source_sink_utils.results()
    self.assert_equals(
        actual,
        ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
def _encode_one_row_to_stream(self, value: Row, out_stream, nested):
    field_coders = self._field_coders
    self._write_mask(value, out_stream, value.get_row_kind().value)
    for i in range(self._field_count):
        item = value[i]
        if item is not None:
            field_coders[i].encode_to_stream(item, out_stream, nested)
def state_access_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    seq_num_source = NumberSequenceSource(1, 10)
    output_path = '/opt/examples/datastream/output/state_access'
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(OutputFileConfig.builder()
                                 .with_part_prefix('pre')
                                 .with_part_suffix('suf')
                                 .build()) \
        .build()
    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='seq_num_source',
        type_info=Types.LONG())
    ds.map(lambda a: Row(a % 4, 1),
           output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .map(MyMapFunction(),
             output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .process(MyKeyedProcessFunction(), Types.LONG()) \
        .sink_to(file_sink)
    env.execute('11-data_stream_state_access')
def wrap_inputs_as_row(*args):
    from pyflink.common.types import Row
    import pandas as pd
    if type(args[0]) == pd.Series:
        return pd.concat(args, axis=1)
    else:
        return Row(*args)
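A quick usage sketch (assuming wrap_inputs_as_row is in scope): scalar arguments fall through to the Row branch, while pandas Series arguments, as passed on the vectorized (Pandas) UDF path, are concatenated column-wise into a single DataFrame:

import pandas as pd
from pyflink.common.types import Row

# scalar path: arguments become the positional fields of a Row
assert wrap_inputs_as_row(1, 'a') == Row(1, 'a')

# vectorized path: one column per input Series
frame = wrap_inputs_as_row(pd.Series([1, 2]), pd.Series(['a', 'b']))
assert frame.shape == (2, 2)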
def _create_orc_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('bytes', DataTypes.BYTES()),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE().bridged_to('java.sql.Date')),
        DataTypes.FIELD('timestamp',
                        DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp')),
    ])
    row_type_info = Types.ROW_NAMED(
        ['char', 'varchar', 'bytes', 'boolean', 'decimal', 'int', 'bigint',
         'double', 'date', 'timestamp'],
        [Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()),
         Types.BOOLEAN(), Types.BIG_DEC(), Types.INT(), Types.LONG(),
         Types.DOUBLE(), Types.SQL_DATE(), Types.SQL_TIMESTAMP()]
    )
    data = [Row(
        char='char',
        varchar='varchar',
        bytes=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=date(1970, 1, 1),
        timestamp=datetime(1970, 1, 2, 3, 4, 5, 600000),
    )]
    return row_type, row_type_info, data
def test_aggregate_with_pandas_udaf_without_keys(self):
    t = self.t_env.from_elements(
        [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT())
        ]))
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.FLOAT(), DataTypes.INT()])
    self.t_env.register_table_sink("Results", table_sink)
    pandas_udaf = udaf(lambda pd: Row(pd.b.mean(), pd.b.max()),
                       result_type=DataTypes.ROW([
                           DataTypes.FIELD("a", DataTypes.FLOAT()),
                           DataTypes.FIELD("b", DataTypes.INT())
                       ]),
                       func_type="pandas")
    t.select(t.b) \
        .aggregate(pandas_udaf.alias("a", "b")) \
        .select("a, b") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[3.8, 8]"])
def test_map(self):
    t = self.t_env.from_elements(
        [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT())
        ]))
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)
    func = udf(lambda x: Row(x + 1, x * x),
               result_type=DataTypes.ROW([
                   DataTypes.FIELD("a", DataTypes.BIGINT()),
                   DataTypes.FIELD("b", DataTypes.BIGINT())
               ]))
    t.map(func(t.b)).alias("a", "b") \
        .map(func(t.a)).alias("a", "b") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["4,9", "3,4", "7,36", "10,81", "5,16"])
def partition_custom_map(self, value):
    if self.num_partitions is None:
        self.num_partitions = int(
            os.environ[data_stream_num_partitions_env_key])
    partition = partitioner.partition(
        key_selector.get_key(value), self.num_partitions)
    return Row(partition, value)
def join_row(left: Row, right: Row):
    fields = []
    for value in left:
        fields.append(value)
    for value in right:
        fields.append(value)
    return Row(*fields)
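Since Row is iterable over its values, join_row concatenates the two rows positionally; field names are not carried over because Row(*fields) is purely positional. A small usage sketch (assuming join_row is in scope):

from pyflink.common import Row

left = Row(1, 'a')
right = Row(True)
# values are concatenated in order; the result has no field names
assert join_row(left, right) == Row(1, 'a', True)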
def convert_to_python_obj(data, type_info):
    if type_info == Types.PICKLED_BYTE_ARRAY():
        return pickle.loads(data)
    elif isinstance(type_info, ExternalTypeInfo):
        return convert_to_python_obj(data, type_info._type_info)
    else:
        gateway = get_gateway()
        pickle_bytes = gateway.jvm.PythonBridgeUtils. \
            getPickledBytesFromJavaObject(data, type_info.get_java_type_info())
        if isinstance(type_info, RowTypeInfo) or isinstance(type_info, TupleTypeInfo):
            field_data = zip(list(pickle_bytes[1:]), type_info.get_field_types())
            fields = []
            for data, field_type in field_data:
                if len(data) == 0:
                    fields.append(None)
                else:
                    fields.append(
                        pickled_bytes_to_python_converter(data, field_type))
            if isinstance(type_info, RowTypeInfo):
                return Row.of_kind(
                    RowKind(int.from_bytes(pickle_bytes[0], 'little')), *fields)
            else:
                return tuple(fields)
        else:
            return pickled_bytes_to_python_converter(pickle_bytes, type_info)
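The Row.of_kind branch above rebuilds the row kind from the first pickled byte array, which holds the RowKind ordinal in little-endian byte order. As a small illustration of that decoding (RowKind ordinals follow Flink's INSERT=0 through DELETE=3):

from pyflink.common import RowKind

assert RowKind(int.from_bytes(b'\x00', 'little')) == RowKind.INSERT
assert RowKind(int.from_bytes(b'\x03', 'little')) == RowKind.DELETE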
def wrapped_keyed_process_function(value):
    if value[0] is not None:
        # it is timer data
        # VALUE: TIMER_FLAG, TIMESTAMP_OF_TIMER, CURRENT_WATERMARK,
        #        CURRENT_KEY_OF_TIMER, None
        on_timer_ctx.set_timestamp(value[1])
        on_timer_ctx.timer_service().set_current_watermark(value[2])
        state_current_key = value[3]
        user_current_key = state_current_key[0]
        on_timer_ctx.set_current_key(user_current_key)
        keyed_state_backend.set_current_key(state_current_key)
        if value[0] == KeyedProcessFunctionInputFlag.EVENT_TIME_TIMER.value:
            on_timer_ctx.set_time_domain(TimeDomain.EVENT_TIME)
        elif value[0] == KeyedProcessFunctionInputFlag.PROC_TIME_TIMER.value:
            on_timer_ctx.set_time_domain(TimeDomain.PROCESSING_TIME)
        else:
            raise TypeError(
                "TimeCharacteristic[%s] is not supported." % str(value[0]))
        output_result = on_timer(value[1], on_timer_ctx)
    else:
        # it is normal data
        # VALUE: TIMER_FLAG, CURRENT_TIMESTAMP, CURRENT_WATERMARK, None,
        #        NORMAL_DATA
        # NORMAL_DATA: CURRENT_KEY, DATA
        ctx.set_timestamp(value[1])
        ctx.timer_service().set_current_watermark(value[2])
        user_current_key = value[4][0]
        state_current_key = Row(user_current_key)
        ctx.set_current_key(user_current_key)
        keyed_state_backend.set_current_key(state_current_key)
        output_result = process_element(value[4][1], ctx)

    if output_result:
        for result in output_result:
            yield Row(None, None, None, result)

    for result in collector.buf:
        # 0: proc time timer data
        # 1: event time timer data
        # 2: normal data
        # result_row: [TIMER_FLAG, TIMER_TYPE, TIMER_KEY, RESULT_DATA]
        yield Row(result[0], result[1], result[2], None)

    collector.clear()
def test_mixed_with_built_in_functions_with_retract(self):
    self.t_env.get_config().get_configuration().set_string(
        "parallelism.default", "1")
    self.t_env.create_temporary_system_function(
        "concat", ConcatAggregateFunction())
    t = self.t_env.from_elements(
        [(1, 'Hi_', 1), (1, 'Hi', 2), (2, 'Hi_', 3), (2, 'Hi', 4),
         (3, None, None), (3, None, None), (4, 'hello2_', 7), (4, 'hello2', 8),
         (5, 'hello_', 9), (5, 'hello', 10)],
        ['a', 'b', 'c'])
    self.t_env.create_temporary_view("source", t)
    table_with_retract_message = self.t_env.sql_query(
        "select a, LAST_VALUE(b) as b, LAST_VALUE(c) as c from source group by a")
    self.t_env.create_temporary_view("retract_table", table_with_retract_message)
    result_table = self.t_env.sql_query(
        "select concat(b, ',') as a, "
        "FIRST_VALUE(b) as b, "
        "LAST_VALUE(b) as c, "
        "COUNT(c) as d, "
        "COUNT(1) as e, "
        "LISTAGG(b) as f,"
        "LISTAGG(b, '|') as g,"
        "MAX(c) as h,"
        "MAX(cast(c as float) + 1) as i,"
        "MIN(c) as j,"
        "MIN(cast(c as decimal) + 1) as k,"
        "SUM(c) as l,"
        "SUM(cast(c as float) + 1) as m,"
        "AVG(c) as n,"
        "AVG(cast(c as double) + 1) as o,"
        "STDDEV_POP(cast(c as float)),"
        "STDDEV_SAMP(cast(c as float)),"
        "VAR_POP(cast(c as float)),"
        "VAR_SAMP(cast(c as float))"
        " from retract_table")
    result = [i for i in result_table.execute().collect()]
    expected = Row('Hi,Hi,hello,hello2', 'Hi', 'hello', 4, 5,
                   'Hi,Hi,hello2,hello', 'Hi|Hi|hello2|hello', 10, 11.0, 2,
                   Decimal(3.0), 24, 28.0, 6, 7.0, 3.1622777, 3.6514838, 10.0,
                   13.333333)
    expected.set_row_kind(RowKind.UPDATE_AFTER)
    self.assertEqual(result[len(result) - 1], expected)