def test_fields(self):
    fields = collections.OrderedDict([
        ("int_field", DataTypes.INT()),
        ("long_field", DataTypes.BIGINT()),
        ("string_field", DataTypes.STRING()),
        ("timestamp_field", DataTypes.TIMESTAMP(3)),
        ("time_field", DataTypes.TIME()),
        ("date_field", DataTypes.DATE()),
        ("double_field", DataTypes.DOUBLE()),
        ("float_field", DataTypes.FLOAT()),
        ("byte_field", DataTypes.TINYINT()),
        ("short_field", DataTypes.SMALLINT()),
        ("boolean_field", DataTypes.BOOLEAN())
    ])

    schema = Schema().fields(fields)
    properties = schema.to_properties()
    expected = {
        'schema.0.name': 'int_field',
        'schema.0.data-type': 'INT',
        'schema.1.name': 'long_field',
        'schema.1.data-type': 'BIGINT',
        'schema.2.name': 'string_field',
        'schema.2.data-type': 'VARCHAR(2147483647)',
        'schema.3.name': 'timestamp_field',
        'schema.3.data-type': 'TIMESTAMP(3)',
        'schema.4.name': 'time_field',
        'schema.4.data-type': 'TIME(0)',
        'schema.5.name': 'date_field',
        'schema.5.data-type': 'DATE',
        'schema.6.name': 'double_field',
        'schema.6.data-type': 'DOUBLE',
        'schema.7.name': 'float_field',
        'schema.7.data-type': 'FLOAT',
        'schema.8.name': 'byte_field',
        'schema.8.data-type': 'TINYINT',
        'schema.9.name': 'short_field',
        'schema.9.data-type': 'SMALLINT',
        'schema.10.name': 'boolean_field',
        'schema.10.data-type': 'BOOLEAN'
    }
    self.assertEqual(expected, properties)

    if sys.version_info[:2] <= (3, 5):
        # plain dicts don't guarantee insertion order before Python 3.6,
        # so Schema.fields() must reject them in favor of an OrderedDict
        fields = {
            "int_field": DataTypes.INT(),
            "long_field": DataTypes.BIGINT(),
            "string_field": DataTypes.STRING(),
            "timestamp_field": DataTypes.TIMESTAMP(3),
            "time_field": DataTypes.TIME(),
            "date_field": DataTypes.DATE(),
            "double_field": DataTypes.DOUBLE(),
            "float_field": DataTypes.FLOAT(),
            "byte_field": DataTypes.TINYINT(),
            "short_field": DataTypes.SMALLINT(),
            "boolean_field": DataTypes.BOOLEAN()
        }
        self.assertRaises(TypeError, Schema().fields, fields)
def sql_type(cls):
    return DataTypes.ARRAY(DataTypes.DOUBLE(False))
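# NOTE: the sql_type() fragment above belongs to a UserDefinedType subclass
# that is not shown in this excerpt. The sketch below is an illustrative
# reconstruction modeled on the ExamplePoint/ExamplePointUDT pair referenced
# elsewhere in these tests; the serialize/deserialize bodies are assumptions.
class ExamplePointUDT(UserDefinedType):
    """Hypothetical UDT for a 2-D point, stored as a pair of doubles."""

    @classmethod
    def sql_type(cls):
        return DataTypes.ARRAY(DataTypes.DOUBLE(False))

    def serialize(self, obj):
        # flatten the point into its SQL representation
        return [obj.x, obj.y]

    def deserialize(self, datum):
        # rebuild the Python object from the SQL representation
        return ExamplePoint(datum[0], datum[1])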
def test_decimal_type(self):
    t1 = DataTypes.DECIMAL(10, 0)
    t2 = DataTypes.DECIMAL(10, 2)
    self.assertTrue(t2 is not t1)
    self.assertNotEqual(t1, t2)
def test_array_type(self):
    # nullable/not_null flag will be lost during the conversion.
    test_types = [
        DataTypes.ARRAY(DataTypes.BIGINT()),
        DataTypes.ARRAY(DataTypes.BIGINT()),
        DataTypes.ARRAY(DataTypes.STRING()),
        DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
        DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))
    ]
    java_types = [_to_java_type(item) for item in test_types]
    converted_python_types = [_from_java_type(item) for item in java_types]
    self.assertEqual(test_types, converted_python_types)
def test_struct_type(self):
    row1 = DataTypes.ROW().add("f1", DataTypes.STRING(nullable=True)) \
        .add("f2", DataTypes.STRING(nullable=True))
    row2 = DataTypes.ROW([
        DataTypes.FIELD("f1", DataTypes.STRING(nullable=True)),
        DataTypes.FIELD("f2", DataTypes.STRING(nullable=True), None)
    ])
    self.assertEqual(row1.field_names(), row2.names)
    self.assertEqual(row1, row2)

    row1 = DataTypes.ROW().add("f1", DataTypes.STRING(nullable=True)) \
        .add("f2", DataTypes.STRING(nullable=True))
    row2 = DataTypes.ROW(
        [DataTypes.FIELD("f1", DataTypes.STRING(nullable=True))])
    self.assertNotEqual(row1.field_names(), row2.names)
    self.assertNotEqual(row1, row2)

    row1 = (DataTypes.ROW().add(
        DataTypes.FIELD("f1", DataTypes.STRING(nullable=True))).add(
            "f2", DataTypes.STRING(nullable=True)))
    row2 = DataTypes.ROW([
        DataTypes.FIELD("f1", DataTypes.STRING(nullable=True)),
        DataTypes.FIELD("f2", DataTypes.STRING(nullable=True))
    ])
    self.assertEqual(row1.field_names(), row2.names)
    self.assertEqual(row1, row2)

    row1 = (DataTypes.ROW().add(
        DataTypes.FIELD("f1", DataTypes.STRING(nullable=True))).add(
            "f2", DataTypes.STRING(nullable=True)))
    row2 = DataTypes.ROW(
        [DataTypes.FIELD("f1", DataTypes.STRING(nullable=True))])
    self.assertNotEqual(row1.field_names(), row2.names)
    self.assertNotEqual(row1, row2)

    # Catch exception raised during improper construction
    self.assertRaises(ValueError, lambda: DataTypes.ROW().add("name"))

    row1 = DataTypes.ROW().add("f1", DataTypes.STRING(nullable=True)) \
        .add("f2", DataTypes.STRING(nullable=True))
    for field in row1:
        self.assertIsInstance(field, RowField)

    row1 = DataTypes.ROW().add("f1", DataTypes.STRING(nullable=True)) \
        .add("f2", DataTypes.STRING(nullable=True))
    self.assertEqual(len(row1), 2)

    row1 = DataTypes.ROW().add("f1", DataTypes.STRING(nullable=True)) \
        .add("f2", DataTypes.STRING(nullable=True))
    self.assertIs(row1["f1"], row1.fields[0])
    self.assertIs(row1[0], row1.fields[0])
    self.assertEqual(row1[0:1], DataTypes.ROW(row1.fields[0:1]))
    self.assertRaises(KeyError, lambda: row1["f9"])
    self.assertRaises(IndexError, lambda: row1[9])
    self.assertRaises(TypeError, lambda: row1[9.9])
def test_group_aggregate_with_aux_group(self):
    t = self.t_env.from_elements(
        [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT())
        ]))
    table_sink = source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd'], [
        DataTypes.TINYINT(),
        DataTypes.INT(),
        DataTypes.FLOAT(),
        DataTypes.INT()
    ])
    self.t_env.register_table_sink("Results", table_sink)
    self.t_env.get_config().get_configuration().set_string(
        'python.metric.enabled', 'true')
    self.t_env.register_function(
        "max_add",
        udaf(MaxAdd(), result_type=DataTypes.INT(), func_type="pandas"))
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
    t.group_by("a") \
        .select("a, a + 1 as b, a + 2 as c") \
        .group_by("a, b") \
        .select("a, b, mean_udaf(b), max_add(b, c, 1)") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["1,2,2.0,6", "2,3,3.0,8", "3,4,4.0,10"])
def test_map_with_pandas_udf(self):
    t = self.t_env.from_elements(
        [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)),
         (2, Row(3, 4))],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.ROW(
                 [DataTypes.FIELD("a", DataTypes.INT()),
                  DataTypes.FIELD("b", DataTypes.INT())]))]))
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)

    def func(x, y):
        import pandas as pd
        a = (x * 2).rename('b')
        res = pd.concat([a, x], axis=1) + y
        return res

    pandas_udf = udf(func,
                     result_type=DataTypes.ROW(
                         [DataTypes.FIELD("c", DataTypes.BIGINT()),
                          DataTypes.FIELD("d", DataTypes.BIGINT())]),
                     func_type='pandas')
    t.map(pandas_udf(t.a, t.b)).execute_insert("Results").wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["3,5", "3,7", "6,6", "9,8", "5,8"])
def get_result_type(self) -> DataType:
    return DataTypes.FLOAT()
def test_from_pandas_with_incorrect_schema(self):
    fields = self.data_type.fields.copy()
    fields[0], fields[7] = fields[7], fields[0]  # swap str with tinyint
    wrong_schema = DataTypes.ROW(fields)  # should be DataTypes.STRING()
    with self.assertRaisesRegex(Exception, "Expected a string.*got int8"):
        self.t_env.from_pandas(self.pdf, schema=wrong_schema)
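# NOTE: a minimal companion sketch, not part of the original tests — the
# positive case for contrast: with the unmodified schema the same conversion
# is expected to succeed and preserve the field names.
def test_from_pandas_with_matching_schema(self):
    t = self.t_env.from_pandas(self.pdf, schema=self.data_type)
    self.assertEqual(t.get_schema().get_field_names(),
                     self.data_type.field_names())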
def get_result_type(self):
    return DataTypes.BIGINT()
def get_accumulator_type(self) -> DataType:
    return DataTypes.ARRAY(DataTypes.BIGINT())
def test_basic_type(self):
    test_types = [
        DataTypes.STRING(),
        DataTypes.BOOLEAN(),
        DataTypes.BYTES(),
        DataTypes.TINYINT(),
        DataTypes.SMALLINT(),
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.DATE(),
        DataTypes.TIME(),
        DataTypes.TIMESTAMP(3)
    ]
    java_types = [_to_java_type(item) for item in test_types]
    converted_python_types = [_from_java_type(item) for item in java_types]
    self.assertEqual(test_types, converted_python_types)
def test_verify_type_not_nullable(self):
    import array
    import datetime
    import decimal

    schema = DataTypes.ROW([
        DataTypes.FIELD('s', DataTypes.STRING(nullable=False)),
        DataTypes.FIELD('i', DataTypes.INT(True))
    ])

    class MyObj:
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    # obj, data_type
    success_spec = [
        # String
        ("", DataTypes.STRING()),
        (u"", DataTypes.STRING()),
        # UDT
        (ExamplePoint(1.0, 2.0), ExamplePointUDT()),
        # Boolean
        (True, DataTypes.BOOLEAN()),
        # TinyInt
        (-(2**7), DataTypes.TINYINT()),
        (2**7 - 1, DataTypes.TINYINT()),
        # SmallInt
        (-(2**15), DataTypes.SMALLINT()),
        (2**15 - 1, DataTypes.SMALLINT()),
        # Int
        (-(2**31), DataTypes.INT()),
        (2**31 - 1, DataTypes.INT()),
        # BigInt
        (2**64, DataTypes.BIGINT()),
        # Float & Double
        (1.0, DataTypes.FLOAT()),
        (1.0, DataTypes.DOUBLE()),
        # Decimal
        (decimal.Decimal("1.0"), DataTypes.DECIMAL(10, 0)),
        # Binary
        (bytearray([1]), DataTypes.BINARY(1)),
        # Date/Time/Timestamp
        (datetime.date(2000, 1, 2), DataTypes.DATE()),
        (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.DATE()),
        (datetime.time(1, 1, 2), DataTypes.TIME()),
        (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.TIMESTAMP()),
        # Array
        ([], DataTypes.ARRAY(DataTypes.INT())),
        (["1", None], DataTypes.ARRAY(DataTypes.STRING(nullable=True))),
        ([1, 2], DataTypes.ARRAY(DataTypes.INT())),
        ((1, 2), DataTypes.ARRAY(DataTypes.INT())),
        (array.array('h', [1, 2]), DataTypes.ARRAY(DataTypes.INT())),
        # Map
        ({}, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
        ({"a": 1}, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
        ({"a": None},
         DataTypes.MAP(DataTypes.STRING(nullable=False), DataTypes.INT(True))),
        # Struct
        ({"s": "a", "i": 1}, schema),
        ({"s": "a", "i": None}, schema),
        ({"s": "a"}, schema),
        ({"s": "a", "f": 1.0}, schema),
        (Row(s="a", i=1), schema),
        (Row(s="a", i=None), schema),
        (Row(s="a", i=1, f=1.0), schema),
        (["a", 1], schema),
        (["a", None], schema),
        (("a", 1), schema),
        (MyObj(s="a", i=1), schema),
        (MyObj(s="a", i=None), schema),
        (MyObj(s="a"), schema),
    ]

    # obj, data_type, exception class
    failure_spec = [
        # Char/VarChar (match anything but None)
        (None, DataTypes.VARCHAR(1), ValueError),
        (None, DataTypes.CHAR(1), ValueError),
        # VarChar (length exceeds maximum length)
        ("abc", DataTypes.VARCHAR(1), ValueError),
        # Char (length exceeds length)
        ("abc", DataTypes.CHAR(1), ValueError),
        # UDT
        (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),
        # Boolean
        (1, DataTypes.BOOLEAN(), TypeError),
        ("True", DataTypes.BOOLEAN(), TypeError),
        ([1], DataTypes.BOOLEAN(), TypeError),
        # TinyInt
        (-(2**7) - 1, DataTypes.TINYINT(), ValueError),
        (2**7, DataTypes.TINYINT(), ValueError),
        ("1", DataTypes.TINYINT(), TypeError),
        (1.0, DataTypes.TINYINT(), TypeError),
        # SmallInt
        (-(2**15) - 1, DataTypes.SMALLINT(), ValueError),
        (2**15, DataTypes.SMALLINT(), ValueError),
        # Int
        (-(2**31) - 1, DataTypes.INT(), ValueError),
        (2**31, DataTypes.INT(), ValueError),
        # Float & Double
        (1, DataTypes.FLOAT(), TypeError),
        (1, DataTypes.DOUBLE(), TypeError),
        # Decimal
        (1.0, DataTypes.DECIMAL(10, 0), TypeError),
        (1, DataTypes.DECIMAL(10, 0), TypeError),
        ("1.0", DataTypes.DECIMAL(10, 0), TypeError),
        # Binary
        (1, DataTypes.BINARY(1), TypeError),
        # VarBinary (length exceeds maximum length)
        (bytearray([1, 2]), DataTypes.VARBINARY(1), ValueError),
        # Binary (length exceeds length)
        (bytearray([1, 2]), DataTypes.BINARY(1), ValueError),
        # Date/Time/Timestamp
        ("2000-01-02", DataTypes.DATE(), TypeError),
        ("10:01:02", DataTypes.TIME(), TypeError),
        (946811040, DataTypes.TIMESTAMP(), TypeError),
        # Array
        (["1", None],
         DataTypes.ARRAY(DataTypes.VARCHAR(1, nullable=False)), ValueError),
        ([1, "2"], DataTypes.ARRAY(DataTypes.INT()), TypeError),
        # Map
        ({"a": 1},
         DataTypes.MAP(DataTypes.INT(), DataTypes.INT()), TypeError),
        ({"a": "1"},
         DataTypes.MAP(DataTypes.VARCHAR(1), DataTypes.INT()), TypeError),
        ({"a": None},
         DataTypes.MAP(DataTypes.VARCHAR(1), DataTypes.INT(False)), ValueError),
        # Struct
        ({"s": "a", "i": "1"}, schema, TypeError),
        (Row(s="a"), schema, ValueError),  # Row can't have missing field
        (Row(s="a", i="1"), schema, TypeError),
        (["a"], schema, ValueError),
        (["a", "1"], schema, TypeError),
        (MyObj(s="a", i="1"), schema, TypeError),
        (MyObj(s=None, i="1"), schema, ValueError),
    ]

    # Check success cases
    for obj, data_type in success_spec:
        try:
            _create_type_verifier(data_type.not_null())(obj)
        except (TypeError, ValueError):
            self.fail("verify_type(%s, %s, nullable=False)"
                      % (obj, data_type))

    # Check failure cases
    for obj, data_type, exp in failure_spec:
        msg = "verify_type(%s, %s, nullable=False) == %s" % (
            obj, data_type, exp)
        with self.assertRaises(exp, msg=msg):
            _create_type_verifier(data_type.not_null())(obj)
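# NOTE: an illustrative usage sketch, not part of the test above.
# _create_type_verifier returns a callable that raises TypeError (wrong
# Python type) or ValueError (out-of-range value or null violation) and
# returns nothing on success — the contract the spec tables exercise.
verify_int = _create_type_verifier(DataTypes.INT().not_null())
verify_int(42)  # passes silently
for bad in ("42", 2**31, None):
    try:
        verify_int(bad)
    except (TypeError, ValueError) as e:
        # str -> TypeError; 2**31 and None -> ValueError
        print(type(e).__name__, e)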
def test_timestamp_microsecond(self):
    tst = DataTypes.TIMESTAMP()
    self.assertEqual(
        tst.to_sql_type(datetime.datetime.max) % 1000000, 999999)
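# NOTE: an illustrative sketch, not from the source. TimestampType.to_sql_type()
# encodes a datetime as an integer count of microseconds since the epoch (the
# seconds part of a naive datetime is resolved against the local timezone),
# which is why `% 1000000` above isolates the microsecond component.
ts_type = DataTypes.TIMESTAMP()
micros = ts_type.to_sql_type(datetime.datetime(1970, 1, 1, 0, 0, 0, 123456))
assert micros % 1000000 == 123456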
def test_sliding_group_window_over_time(self):
    # create source file path
    import tempfile
    import os
    tmp_dir = tempfile.gettempdir()
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_sliding_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    from pyflink.table.window import Slide
    self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    table_sink = source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd'], [
        DataTypes.TINYINT(),
        DataTypes.TIMESTAMP(3),
        DataTypes.TIMESTAMP(3),
        DataTypes.FLOAT()
    ])
    self.t_env.register_table_sink("Results", table_sink)
    t.window(Slide.over("1.hours").every("30.minutes").on("rowtime").alias("w")) \
        .group_by("a, b, w") \
        .select("a, w.start, w.end, mean_udaf(c) as b") \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "1,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,2.0",
        "1,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.5",
        "1,2018-03-11 03:30:00.0,2018-03-11 04:30:00.0,5.5",
        "1,2018-03-11 04:00:00.0,2018-03-11 05:00:00.0,8.0",
        "2,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,1.0",
        "2,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.0",
        "2,2018-03-11 03:30:00.0,2018-03-11 04:30:00.0,3.0",
        "3,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.0",
        "3,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,2.0"
    ])
    os.remove(source_path)
def setUpClass(cls):
    super(PandasConversionTestBase, cls).setUpClass()
    cls.data = [
        (1, 1, 1, 1, True, 1.1, 1.2, 'hello', bytearray(b"aaa"),
         decimal.Decimal('1000000000000000000.01'),
         datetime.date(2014, 9, 13),
         datetime.time(hour=1, minute=0, second=1),
         datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
         ['hello', '中文'],
         Row(a=1, b='hello',
             c=datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
             d=[1, 2])),
        (1, 2, 2, 2, False, 2.1, 2.2, 'world', bytearray(b"bbb"),
         decimal.Decimal('1000000000000000000.02'),
         datetime.date(2014, 9, 13),
         datetime.time(hour=1, minute=0, second=1),
         datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
         ['hello', '中文'],
         Row(a=1, b='hello',
             c=datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
             d=[1, 2]))
    ]
    cls.data_type = DataTypes.ROW([
        DataTypes.FIELD("f1", DataTypes.TINYINT()),
        DataTypes.FIELD("f2", DataTypes.SMALLINT()),
        DataTypes.FIELD("f3", DataTypes.INT()),
        DataTypes.FIELD("f4", DataTypes.BIGINT()),
        DataTypes.FIELD("f5", DataTypes.BOOLEAN()),
        DataTypes.FIELD("f6", DataTypes.FLOAT()),
        DataTypes.FIELD("f7", DataTypes.DOUBLE()),
        DataTypes.FIELD("f8", DataTypes.STRING()),
        DataTypes.FIELD("f9", DataTypes.BYTES()),
        DataTypes.FIELD("f10", DataTypes.DECIMAL(38, 18)),
        DataTypes.FIELD("f11", DataTypes.DATE()),
        DataTypes.FIELD("f12", DataTypes.TIME()),
        DataTypes.FIELD("f13", DataTypes.TIMESTAMP(3)),
        DataTypes.FIELD("f14", DataTypes.ARRAY(DataTypes.STRING())),
        DataTypes.FIELD("f15", DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.INT()),
            DataTypes.FIELD("b", DataTypes.STRING()),
            DataTypes.FIELD("c", DataTypes.TIMESTAMP(3)),
            DataTypes.FIELD("d", DataTypes.ARRAY(DataTypes.INT()))
        ]))
    ], False)
    cls.pdf = cls.create_pandas_data_frame()
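# NOTE: create_pandas_data_frame() is not shown in this excerpt. A plausible
# minimal sketch, assuming it simply pivots cls.data into columns keyed by the
# field names of cls.data_type (the real helper may also coerce numpy dtypes):
@classmethod
def create_pandas_data_frame(cls):
    import pandas as pd
    data_dict = {name: [row[i] for row in cls.data]
                 for i, name in enumerate(cls.data_type.field_names())}
    return pd.DataFrame(data=data_dict)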
def test_group_aggregate_function(self):
    t = self.t_env.from_elements(
        [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT())
        ]))
    table_sink = source_sink_utils.TestAppendSink(['a', 'b', 'c'], [
        DataTypes.TINYINT(),
        DataTypes.FLOAT(),
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.INT()),
            DataTypes.FIELD("b", DataTypes.INT())
        ])
    ])
    self.t_env.register_table_sink("Results", table_sink)

    # general udf
    add = udf(lambda a: a + 1, result_type=DataTypes.INT())

    # pandas udf
    subtract = udf(lambda a: a - 1, result_type=DataTypes.INT(),
                   func_type="pandas")

    max_udaf = udaf(lambda a: (a.max(), a.min()),
                    result_type=DataTypes.ROW([
                        DataTypes.FIELD("a", DataTypes.INT()),
                        DataTypes.FIELD("b", DataTypes.INT())
                    ]),
                    func_type="pandas")
    t.group_by("a") \
        .select(t.a, mean_udaf(add(t.b)), max_udaf(subtract(t.c))) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["1,6.0,5,2", "2,3.0,3,2", "3,3.0,2,2"])
def test_register_table_source_and_sink(self):
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    self.prepare_csv_source(source_path, data, field_types, field_names)
    sink_path = os.path.join(self.tempdir, 'streaming2.csv')
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    t_env = self.t_env

    t_env.connect(FileSystem().path(source_path))\
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .register_table_source_and_sink("source")
    t_env.connect(FileSystem().path(sink_path))\
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .register_table_source_and_sink("sink")
    t_env.scan("source") \
        .select("a + 1, b, c") \
        .insert_into("sink")
    self.env.execute()

    with open(sink_path, 'r') as f:
        lines = f.read()
        assert lines == '2,Hi,Hello\n' + '3,Hello,Hello\n'
def test_map(self):
    t = self.t_env.from_elements(
        [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT())]))
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)

    func = udf(lambda x: Row(x + 1, x * x),
               result_type=DataTypes.ROW(
                   [DataTypes.FIELD("a", DataTypes.BIGINT()),
                    DataTypes.FIELD("b", DataTypes.BIGINT())]))

    t.map(func(t.b)).alias("a", "b") \
        .map(func(t.a)).alias("a", "b") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["4,9", "3,4", "7,36", "10,81", "5,16"])
def test_field(self):
    schema = Schema()\
        .field("int_field", DataTypes.INT())\
        .field("long_field", DataTypes.BIGINT())\
        .field("string_field", DataTypes.STRING())\
        .field("timestamp_field", DataTypes.TIMESTAMP())\
        .field("time_field", DataTypes.TIME())\
        .field("date_field", DataTypes.DATE())\
        .field("double_field", DataTypes.DOUBLE())\
        .field("float_field", DataTypes.FLOAT())\
        .field("byte_field", DataTypes.TINYINT())\
        .field("short_field", DataTypes.SMALLINT())\
        .field("boolean_field", DataTypes.BOOLEAN())

    properties = schema.to_properties()
    expected = {
        'schema.0.name': 'int_field',
        'schema.0.type': 'INT',
        'schema.1.name': 'long_field',
        'schema.1.type': 'BIGINT',
        'schema.2.name': 'string_field',
        'schema.2.type': 'VARCHAR',
        'schema.3.name': 'timestamp_field',
        'schema.3.type': 'TIMESTAMP',
        'schema.4.name': 'time_field',
        'schema.4.type': 'TIME',
        'schema.5.name': 'date_field',
        'schema.5.type': 'DATE',
        'schema.6.name': 'double_field',
        'schema.6.type': 'DOUBLE',
        'schema.7.name': 'float_field',
        'schema.7.type': 'FLOAT',
        'schema.8.name': 'byte_field',
        'schema.8.type': 'TINYINT',
        'schema.9.name': 'short_field',
        'schema.9.type': 'SMALLINT',
        'schema.10.name': 'boolean_field',
        'schema.10.type': 'BOOLEAN'
    }
    self.assertEqual(expected, properties)
def test_flat_map(self):
    t = self.t_env.from_elements(
        [(1, "2,3", 3), (2, "1", 3), (1, "5,6,7", 4)],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.STRING()),
             DataTypes.FIELD("c", DataTypes.INT())]))
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.STRING()])
    self.t_env.register_table_sink("Results", table_sink)

    @udtf(result_types=[DataTypes.INT(), DataTypes.STRING()])
    def split(x, string):
        for s in string.split(","):
            yield x, s

    t.flat_map(split(t.a, t.b)) \
        .alias("a, b") \
        .flat_map(split(t.a, t.b)) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["1,2", "1,3", "2,1", "1,5", "1,6", "1,7"])
            mean_udaf(b)
                over (PARTITION BY a ORDER BY proctime
                      ROWS BETWEEN 1 PRECEDING AND CURRENT ROW),
            max_add_min_udaf(b)
                over (PARTITION BY a ORDER BY proctime
                      ROWS BETWEEN 1 PRECEDING AND CURRENT ROW)
        from source_table
    """).wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "1,1.0,2", "1,3.0,6", "1,6.5,13",
        "2,1.0,2", "2,2.0,4", "3,2.0,4"
    ])
    os.remove(source_path)


@udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
def mean_udaf(v):
    return v.mean()


class MaxAdd(AggregateFunction, unittest.TestCase):

    def open(self, function_context):
        mg = function_context.get_metric_group()
        self.counter = mg.add_group("key", "value").counter("my_counter")
        self.counter_sum = 0

    def get_value(self, accumulator):
        # counter
        self.counter.inc(10)
        self.counter_sum += 10
        return accumulator[0]
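    # NOTE: the excerpt cuts this class off after get_value(). A pandas
    # AggregateFunction also needs at least create_accumulator() and
    # accumulate(); the bodies below are an illustrative sketch consistent
    # with the max_add(b, c, 1) results asserted in the tests above
    # (sum of the per-column maxima of the arguments).
    def create_accumulator(self):
        return []

    def accumulate(self, accumulator, *args):
        # each arg is a pandas.Series holding one grouped column
        result = 0
        for arg in args:
            result += arg.max()
        accumulator.append(result)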
def test_map_type(self):
    test_types = [
        DataTypes.MAP(DataTypes.BIGINT(), DataTypes.BIGINT()),
        DataTypes.MAP(DataTypes.STRING(), DataTypes.STRING()),
        DataTypes.MAP(
            DataTypes.STRING(),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())),
        DataTypes.MAP(
            DataTypes.STRING(),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.STRING()))
    ]
    java_types = [_to_java_type(item) for item in test_types]
    converted_python_types = [_from_java_type(item) for item in java_types]
    self.assertEqual(test_types, converted_python_types)
def test_tumble_group_window_aggregate_function(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [(1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
         (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
         (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
            DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))
        ]))
    table_sink = source_sink_utils.TestAppendSink(['a', 'b', 'c'], [
        DataTypes.TIMESTAMP(3),
        DataTypes.TIMESTAMP(3),
        DataTypes.FLOAT()
    ])
    self.t_env.register_table_sink("Results", table_sink)
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
    tumble_window = Tumble.over(expr.lit(1).hours) \
        .on(expr.col("rowtime")) \
        .alias("w")
    t.window(tumble_window) \
        .group_by("w") \
        .select("w.start, w.end, mean_udaf(b)") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.2",
        "2018-03-11 04:00:00.0,2018-03-11 05:00:00.0,8.0"
    ])
def test_merge_type(self):
    self.assertEqual(_merge_type(DataTypes.BIGINT(), DataTypes.NULL()),
                     DataTypes.BIGINT())
    self.assertEqual(_merge_type(DataTypes.NULL(), DataTypes.BIGINT()),
                     DataTypes.BIGINT())
    self.assertEqual(_merge_type(DataTypes.BIGINT(), DataTypes.BIGINT()),
                     DataTypes.BIGINT())

    self.assertEqual(
        _merge_type(DataTypes.ARRAY(DataTypes.BIGINT()),
                    DataTypes.ARRAY(DataTypes.BIGINT())),
        DataTypes.ARRAY(DataTypes.BIGINT()))
    with self.assertRaises(TypeError):
        _merge_type(DataTypes.ARRAY(DataTypes.BIGINT()),
                    DataTypes.ARRAY(DataTypes.DOUBLE()))

    self.assertEqual(
        _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                    DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())),
        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()))
    with self.assertRaises(TypeError):
        _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                    DataTypes.MAP(DataTypes.DOUBLE(), DataTypes.BIGINT()))
    with self.assertRaises(TypeError):
        _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                    DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()))

    self.assertEqual(
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                           DataTypes.FIELD('f2', DataTypes.STRING())])),
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                       DataTypes.FIELD('f2', DataTypes.STRING())]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.DOUBLE()),
                           DataTypes.FIELD('f2', DataTypes.STRING())]))

    self.assertEqual(
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))]),
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))])),
        DataTypes.ROW([DataTypes.FIELD(
            'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))]),
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.STRING())]))]))

    self.assertEqual(
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                           DataTypes.FIELD('f2', DataTypes.STRING())])),
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                       DataTypes.FIELD('f2', DataTypes.STRING())]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.DOUBLE())),
                           DataTypes.FIELD('f2', DataTypes.STRING())]))

    self.assertEqual(
        _merge_type(
            DataTypes.ROW([
                DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.BIGINT())),
                DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([
                DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.BIGINT())),
                DataTypes.FIELD('f2', DataTypes.STRING())])),
        DataTypes.ROW([
            DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                DataTypes.BIGINT())),
            DataTypes.FIELD('f2', DataTypes.STRING())]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([
                DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.BIGINT())),
                DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([
                DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.DOUBLE())),
                DataTypes.FIELD('f2', DataTypes.STRING())]))

    self.assertEqual(
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ARRAY(
                    DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())))]),
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ARRAY(
                    DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())))])),
        DataTypes.ROW([DataTypes.FIELD(
            'f1', DataTypes.ARRAY(
                DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())))]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ARRAY(
                    DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())))]),
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ARRAY(
                    DataTypes.MAP(DataTypes.DOUBLE(), DataTypes.BIGINT())))]))
def test_slide_group_window_aggregate_function(self):
    import datetime
    from pyflink.table.window import Slide
    t = self.t_env.from_elements(
        [(1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
         (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
         (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
            DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))
        ]))
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd', 'e'],
        [
            DataTypes.TINYINT(),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.FLOAT(),
            DataTypes.INT()
        ])
    self.t_env.register_table_sink("Results", table_sink)
    self.t_env.register_function(
        "max_add",
        udaf(MaxAdd(), result_type=DataTypes.INT(), func_type="pandas"))
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
    slide_window = Slide.over(expr.lit(1).hours) \
        .every(expr.lit(30).minutes) \
        .on(expr.col("rowtime")) \
        .alias("w")
    t.window(slide_window) \
        .group_by("a, w") \
        .select("a, w.start, w.end, mean_udaf(b), max_add(b, c, 1)") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "1,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,2.0,6",
        "1,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.5,7",
        "1,2018-03-11 03:30:00.0,2018-03-11 04:30:00.0,5.5,14",
        "1,2018-03-11 04:00:00.0,2018-03-11 05:00:00.0,8.0,14",
        "2,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,1.0,4",
        "2,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.0,10",
        "2,2018-03-11 03:30:00.0,2018-03-11 04:30:00.0,3.0,10",
        "3,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.0,7",
        "3,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,2.0,7"
    ])
def test_data_type_eq(self):
    lt = DataTypes.BIGINT()
    lt2 = pickle.loads(pickle.dumps(DataTypes.BIGINT()))
    self.assertEqual(lt, lt2)
def test_over_window_aggregate_function(self):
    import datetime
    t = self.t_env.from_elements(
        [(1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (3, 2, 1, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
         (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
         (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
            DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))
        ]))
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'],
        [
            DataTypes.TINYINT(),
            DataTypes.FLOAT(),
            DataTypes.INT(),
            DataTypes.FLOAT(),
            DataTypes.FLOAT(),
            DataTypes.FLOAT(),
            DataTypes.FLOAT(),
            DataTypes.FLOAT(),
            DataTypes.FLOAT(),
            DataTypes.FLOAT()
        ])
    self.t_env.register_table_sink("Results", table_sink)
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
    self.t_env.register_function(
        "max_add",
        udaf(MaxAdd(), result_type=DataTypes.INT(), func_type="pandas"))
    self.t_env.register_table("T", t)
    self.t_env.execute_sql("""
        insert into Results
        select a,
            mean_udaf(b)
                over (PARTITION BY a ORDER BY rowtime
                      ROWS BETWEEN UNBOUNDED preceding AND UNBOUNDED FOLLOWING),
            max_add(b, c)
                over (PARTITION BY a ORDER BY rowtime
                      ROWS BETWEEN UNBOUNDED preceding AND 0 FOLLOWING),
            mean_udaf(b)
                over (PARTITION BY a ORDER BY rowtime
                      ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING),
            mean_udaf(c)
                over (PARTITION BY a ORDER BY rowtime
                      ROWS BETWEEN 1 PRECEDING AND 0 FOLLOWING),
            mean_udaf(c)
                over (PARTITION BY a ORDER BY rowtime
                      RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
            mean_udaf(b)
                over (PARTITION BY a ORDER BY rowtime
                      RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW),
            mean_udaf(b)
                over (PARTITION BY a ORDER BY rowtime
                      RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING
                      AND UNBOUNDED FOLLOWING),
            mean_udaf(c)
                over (PARTITION BY a ORDER BY rowtime
                      RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING
                      AND UNBOUNDED FOLLOWING),
            mean_udaf(c)
                over (PARTITION BY a ORDER BY rowtime
                      RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING
                      AND CURRENT ROW)
        from T
    """).wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "1,4.3333335,5,4.3333335,3.0,3.0,2.5,4.3333335,3.0,2.0",
        "1,4.3333335,13,5.5,3.0,3.0,4.3333335,8.0,5.0,5.0",
        "1,4.3333335,6,4.3333335,2.0,3.0,2.5,4.3333335,3.0,2.0",
        "2,2.0,9,2.0,4.0,4.0,2.0,2.0,4.0,4.0",
        "2,2.0,3,2.0,2.0,4.0,1.0,2.0,4.0,2.0",
        "3,2.0,3,2.0,1.0,1.0,2.0,2.0,1.0,1.0"
    ])
def test_datetype_equal_zero(self):
    dt = DataTypes.DATE()
    self.assertEqual(dt.from_sql_type(0), datetime.date(1970, 1, 1))
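# NOTE: an illustrative sketch, not from the source. DateType.from_sql_type()
# interprets its argument as days since the Unix epoch, so 0 maps to
# 1970-01-01 above and, e.g., 18262 maps to 2020-01-01:
assert DataTypes.DATE().from_sql_type(18262) == datetime.date(2020, 1, 1)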
def test_array_type(self):
    test_types = [
        DataTypes.ARRAY(DataTypes.BIGINT()),
        # array type with not null basic data type means primitive array
        DataTypes.ARRAY(DataTypes.BIGINT().not_null()),
        DataTypes.ARRAY(DataTypes.STRING()),
        DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
        DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))
    ]
    java_types = [_to_java_type(item) for item in test_types]
    converted_python_types = [_from_java_type(item) for item in java_types]
    self.assertEqual(test_types, converted_python_types)