예제 #1
0
    def test_fields(self):
        """Check the properties produced by ``Schema.fields``.

        An ordered mapping of column names to data types must be rendered as
        ``schema.<index>.name`` / ``schema.<index>.data-type`` entries. On
        interpreters up to Python 3.5 a plain ``dict`` is additionally
        expected to be rejected with ``TypeError``.
        """
        column_specs = [
            ("int_field", DataTypes.INT()),
            ("long_field", DataTypes.BIGINT()),
            ("string_field", DataTypes.STRING()),
            ("timestamp_field", DataTypes.TIMESTAMP(3)),
            ("time_field", DataTypes.TIME()),
            ("date_field", DataTypes.DATE()),
            ("double_field", DataTypes.DOUBLE()),
            ("float_field", DataTypes.FLOAT()),
            ("byte_field", DataTypes.TINYINT()),
            ("short_field", DataTypes.SMALLINT()),
            ("boolean_field", DataTypes.BOOLEAN()),
        ]
        ordered_fields = collections.OrderedDict(column_specs)

        actual_properties = Schema().fields(ordered_fields).to_properties()

        expected_properties = {
            'schema.0.name': 'int_field',
            'schema.0.data-type': 'INT',
            'schema.1.name': 'long_field',
            'schema.1.data-type': 'BIGINT',
            'schema.2.name': 'string_field',
            'schema.2.data-type': 'VARCHAR(2147483647)',
            'schema.3.name': 'timestamp_field',
            'schema.3.data-type': 'TIMESTAMP(3)',
            'schema.4.name': 'time_field',
            'schema.4.data-type': 'TIME(0)',
            'schema.5.name': 'date_field',
            'schema.5.data-type': 'DATE',
            'schema.6.name': 'double_field',
            'schema.6.data-type': 'DOUBLE',
            'schema.7.name': 'float_field',
            'schema.7.data-type': 'FLOAT',
            'schema.8.name': 'byte_field',
            'schema.8.data-type': 'TINYINT',
            'schema.9.name': 'short_field',
            'schema.9.data-type': 'SMALLINT',
            'schema.10.name': 'boolean_field',
            'schema.10.data-type': 'BOOLEAN',
        }
        self.assertEqual(expected_properties, actual_properties)

        # The plain-dict rejection is only asserted on Python <= 3.5.
        if sys.version_info[:2] > (3, 5):
            return

        plain_fields = dict(
            int_field=DataTypes.INT(),
            long_field=DataTypes.BIGINT(),
            string_field=DataTypes.STRING(),
            timestamp_field=DataTypes.TIMESTAMP(3),
            time_field=DataTypes.TIME(),
            date_field=DataTypes.DATE(),
            double_field=DataTypes.DOUBLE(),
            float_field=DataTypes.FLOAT(),
            byte_field=DataTypes.TINYINT(),
            short_field=DataTypes.SMALLINT(),
            boolean_field=DataTypes.BOOLEAN(),
        )
        fresh_schema = Schema()
        with self.assertRaises(TypeError):
            fresh_schema.fields(plain_fields)
예제 #2
0
 def sql_type(cls):
     """Return the SQL type: an ARRAY of ``DataTypes.DOUBLE(False)``."""
     # The positional False is presumably the nullable flag -- TODO confirm.
     element_type = DataTypes.DOUBLE(False)
     return DataTypes.ARRAY(element_type)
예제 #3
0
 def test_decimal_type(self):
     """DECIMAL types that differ only in scale are distinct and unequal."""
     scale_zero = DataTypes.DECIMAL(10, 0)
     scale_two = DataTypes.DECIMAL(10, 2)
     # Identity first, then value equality.
     self.assertTrue(scale_two is not scale_zero)
     self.assertNotEqual(scale_zero, scale_two)
예제 #4
0
    def test_array_type(self):
        """Round-trip ARRAY types through ``_to_java_type`` and
        ``_from_java_type`` and expect the original types back.
        """
        # nullable/not_null flag will be lost during the conversion.
        python_types = [
            DataTypes.ARRAY(DataTypes.BIGINT()),
            DataTypes.ARRAY(DataTypes.BIGINT()),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
            DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING())),
        ]

        # Convert everything to the Java side first, then all the way back.
        as_java = list(map(_to_java_type, python_types))
        round_tripped = list(map(_from_java_type, as_java))

        self.assertEqual(python_types, round_tripped)
예제 #5
0
    def test_struct_type(self):
        """Exercise ROW construction, equality, iteration, length and
        indexing (by name, position and slice), including the error cases.
        """
        def nullable_string():
            return DataTypes.STRING(nullable=True)

        def row_added_by_name():
            # Both fields appended through add(name, type).
            return DataTypes.ROW().add("f1", nullable_string()) \
                .add("f2", nullable_string())

        def row_added_mixed():
            # First field appended as a FIELD object, second by name/type.
            return (DataTypes.ROW()
                    .add(DataTypes.FIELD("f1", nullable_string()))
                    .add("f2", nullable_string()))

        def single_field_row():
            return DataTypes.ROW([DataTypes.FIELD("f1", nullable_string())])

        # add(name, type) vs. explicit FIELD list with the same two fields.
        built = row_added_by_name()
        declared = DataTypes.ROW([
            DataTypes.FIELD("f1", nullable_string()),
            DataTypes.FIELD("f2", nullable_string(), None),
        ])
        self.assertEqual(built.field_names(), declared.names)
        self.assertEqual(built, declared)

        # add(name, type) vs. a row that lacks the second field.
        built = row_added_by_name()
        declared = single_field_row()
        self.assertNotEqual(built.field_names(), declared.names)
        self.assertNotEqual(built, declared)

        # add(FIELD) + add(name, type) vs. explicit FIELD list.
        built = row_added_mixed()
        declared = DataTypes.ROW([
            DataTypes.FIELD("f1", nullable_string()),
            DataTypes.FIELD("f2", nullable_string()),
        ])
        self.assertEqual(built.field_names(), declared.names)
        self.assertEqual(built, declared)

        # add(FIELD) + add(name, type) vs. a row that lacks the second field.
        built = row_added_mixed()
        declared = single_field_row()
        self.assertNotEqual(built.field_names(), declared.names)
        self.assertNotEqual(built, declared)

        # Catch exception raised during improper construction
        with self.assertRaises(ValueError):
            DataTypes.ROW().add("name")

        # Iterating a row yields RowField instances.
        for member in row_added_by_name():
            self.assertIsInstance(member, RowField)

        self.assertEqual(len(row_added_by_name()), 2)

        # Indexing by name, by position and by slice; then the failure modes.
        indexed = row_added_by_name()
        first_field = indexed.fields[0]
        self.assertIs(indexed["f1"], first_field)
        self.assertIs(indexed[0], indexed.fields[0])
        self.assertEqual(indexed[0:1], DataTypes.ROW(indexed.fields[0:1]))
        with self.assertRaises(KeyError):
            indexed["f9"]
        with self.assertRaises(IndexError):
            indexed[9]
        with self.assertRaises(TypeError):
            indexed[9.9]
예제 #6
0
    def test_group_aggregate_with_aux_group(self):
        """Run two chained group-by/select steps that call ``mean_udaf`` and
        the registered ``max_add`` aggregate, then check the sink contents.
        """
        rows = [(1, 2, 3), (3, 2, 3), (2, 1, 3),
                (1, 5, 4), (1, 8, 6), (2, 3, 4)]
        row_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
        ])
        t = self.t_env.from_elements(rows, row_type)

        sink_columns = ['a', 'b', 'c', 'd']
        sink_types = [
            DataTypes.TINYINT(),
            DataTypes.INT(),
            DataTypes.FLOAT(),
            DataTypes.INT(),
        ]
        results_sink = source_sink_utils.TestAppendSink(
            sink_columns, sink_types)
        self.t_env.register_table_sink("Results", results_sink)

        # Switch on Python metrics before the aggregates are registered.
        configuration = self.t_env.get_config().get_configuration()
        configuration.set_string('python.metric.enabled', 'true')

        max_add = udaf(
            MaxAdd(), result_type=DataTypes.INT(), func_type="pandas")
        self.t_env.register_function("max_add", max_add)
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)

        first_pass = t.group_by("a").select("a, a + 1 as b, a + 2 as c")
        second_pass = first_pass.group_by("a, b") \
            .select("a, b, mean_udaf(b), max_add(b, c, 1)")
        second_pass.execute_insert("Results").wait()

        collected = source_sink_utils.results()
        self.assert_equals(
            collected, ["1,2,2.0,6", "2,3,3.0,8", "3,4,4.0,10"])
예제 #7
0
    def test_map_with_pandas_udf(self):
        """Apply a pandas-type UDF through ``Table.map`` to a table with a
        nested ROW column and check the rows written to the sink.
        """
        rows = [
            (1, Row(2, 3)),
            (2, Row(1, 3)),
            (1, Row(5, 4)),
            (1, Row(8, 6)),
            (2, Row(3, 4)),
        ]
        input_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.INT()),
                DataTypes.FIELD("b", DataTypes.INT()),
            ])),
        ])
        t = self.t_env.from_elements(rows, input_type)

        results_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", results_sink)

        def func(x, y):
            # Doubles x (renamed to 'b'), places it beside x, then adds y.
            import pandas as pd
            doubled = (x * 2).rename('b')
            return pd.concat([doubled, x], axis=1) + y

        output_type = DataTypes.ROW([
            DataTypes.FIELD("c", DataTypes.BIGINT()),
            DataTypes.FIELD("d", DataTypes.BIGINT()),
        ])
        pandas_udf = udf(func, result_type=output_type, func_type='pandas')

        mapped = t.map(pandas_udf(t.a, t.b))
        mapped.execute_insert("Results").wait()

        collected = source_sink_utils.results()
        self.assert_equals(collected, ["3,5", "3,7", "6,6", "9,8", "5,8"])
예제 #8
0
 def get_result_type(self) -> DataType:
     """Return the result type reported by this function: FLOAT."""
     result_type = DataTypes.FLOAT()
     return result_type
예제 #9
0
 def test_from_pandas_with_incorrect_schema(self):
     """``from_pandas`` must fail when the schema's field order does not
     match the data frame's columns.
     """
     shuffled = self.data_type.fields.copy()
     # Exchange the entries at positions 0 and 7 (swap str with tinyint).
     shuffled[7], shuffled[0] = shuffled[0], shuffled[7]
     wrong_schema = DataTypes.ROW(shuffled)
     expected_message = "Expected a string.*got int8"
     with self.assertRaisesRegex(Exception, expected_message):
         self.t_env.from_pandas(self.pdf, schema=wrong_schema)
예제 #10
0
 def get_result_type(self):
     """Return the result type reported by this function: BIGINT."""
     result_type = DataTypes.BIGINT()
     return result_type
예제 #11
0
 def get_accumulator_type(self) -> DataType:
     """Return the accumulator type: an ARRAY of BIGINT."""
     element_type = DataTypes.BIGINT()
     return DataTypes.ARRAY(element_type)
예제 #12
0
    def test_basic_type(self):
        """Round-trip the basic data types through ``_to_java_type`` and
        ``_from_java_type`` and expect the original types back.
        """
        python_types = [
            DataTypes.STRING(),
            DataTypes.BOOLEAN(),
            DataTypes.BYTES(),
            DataTypes.TINYINT(),
            DataTypes.SMALLINT(),
            DataTypes.INT(),
            DataTypes.BIGINT(),
            DataTypes.FLOAT(),
            DataTypes.DOUBLE(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(3),
        ]

        # Convert everything to the Java side first, then all the way back.
        as_java = list(map(_to_java_type, python_types))
        round_tripped = list(map(_from_java_type, as_java))

        self.assertEqual(python_types, round_tripped)
예제 #13
0
    def test_verify_type_not_nullable(self):
        """Table-driven check of ``_create_type_verifier`` on NOT NULL types.

        Every entry of ``success_spec`` is an ``(obj, data_type)`` pair that
        the verifier built from ``data_type.not_null()`` must accept without
        raising ``TypeError`` or ``ValueError``. Every entry of
        ``failure_spec`` is an ``(obj, data_type, exception_class)`` triple
        for which the same verifier must raise ``exception_class``.
        """
        import array
        import datetime
        import decimal

        # Two-field row type shared by all the "Struct" cases below:
        # 's' is declared non-nullable, 'i' is constructed with INT(True).
        schema = DataTypes.ROW([
            DataTypes.FIELD('s', DataTypes.STRING(nullable=False)),
            DataTypes.FIELD('i', DataTypes.INT(True))
        ])

        # Plain object whose attributes are set from keyword arguments; used
        # to feed attribute-bearing objects to the row verifier.
        class MyObj:
            def __init__(self, **kwargs):
                for k, v in kwargs.items():
                    setattr(self, k, v)

        # obj, data_type
        success_spec = [
            # String
            ("", DataTypes.STRING()),
            (u"", DataTypes.STRING()),

            # UDT
            (ExamplePoint(1.0, 2.0), ExamplePointUDT()),

            # Boolean
            (True, DataTypes.BOOLEAN()),

            # TinyInt (both ends of the signed 8-bit range)
            (-(2**7), DataTypes.TINYINT()),
            (2**7 - 1, DataTypes.TINYINT()),

            # SmallInt (both ends of the signed 16-bit range)
            (-(2**15), DataTypes.SMALLINT()),
            (2**15 - 1, DataTypes.SMALLINT()),

            # Int (both ends of the signed 32-bit range)
            (-(2**31), DataTypes.INT()),
            (2**31 - 1, DataTypes.INT()),

            # BigInt (note: 2**64 is listed as accepted here)
            (2**64, DataTypes.BIGINT()),

            # Float & Double
            (1.0, DataTypes.FLOAT()),
            (1.0, DataTypes.DOUBLE()),

            # Decimal
            (decimal.Decimal("1.0"), DataTypes.DECIMAL(10, 0)),

            # Binary
            (bytearray([1]), DataTypes.BINARY(1)),

            # Date/Time/Timestamp
            (datetime.date(2000, 1, 2), DataTypes.DATE()),
            (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.DATE()),
            (datetime.time(1, 1, 2), DataTypes.TIME()),
            (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.TIMESTAMP()),

            # Array (list, tuple and array.array inputs)
            ([], DataTypes.ARRAY(DataTypes.INT())),
            (["1", None], DataTypes.ARRAY(DataTypes.STRING(nullable=True))),
            ([1, 2], DataTypes.ARRAY(DataTypes.INT())),
            ((1, 2), DataTypes.ARRAY(DataTypes.INT())),
            (array.array('h', [1, 2]), DataTypes.ARRAY(DataTypes.INT())),

            # Map
            ({}, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
            ({
                "a": 1
            }, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
            ({
                "a": None
            },
             DataTypes.MAP(DataTypes.STRING(nullable=False),
                           DataTypes.INT(True))),

            # Struct (dict, Row, list, tuple and attribute-object inputs)
            ({
                "s": "a",
                "i": 1
            }, schema),
            ({
                "s": "a",
                "i": None
            }, schema),
            ({
                "s": "a"
            }, schema),
            ({
                "s": "a",
                "f": 1.0
            }, schema),
            (Row(s="a", i=1), schema),
            (Row(s="a", i=None), schema),
            (Row(s="a", i=1, f=1.0), schema),
            (["a", 1], schema),
            (["a", None], schema),
            (("a", 1), schema),
            (MyObj(s="a", i=1), schema),
            (MyObj(s="a", i=None), schema),
            (MyObj(s="a"), schema),
        ]

        # obj, data_type, exception class
        failure_spec = [
            # Char/VarChar (match anything but None)
            (None, DataTypes.VARCHAR(1), ValueError),
            (None, DataTypes.CHAR(1), ValueError),

            # VarChar (length exceeds maximum length)
            ("abc", DataTypes.VARCHAR(1), ValueError),
            # Char (length exceeds length)
            ("abc", DataTypes.CHAR(1), ValueError),

            # UDT
            (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),

            # Boolean
            (1, DataTypes.BOOLEAN(), TypeError),
            ("True", DataTypes.BOOLEAN(), TypeError),
            ([1], DataTypes.BOOLEAN(), TypeError),

            # TinyInt (one past each end of the range, then wrong Python types)
            (-(2**7) - 1, DataTypes.TINYINT(), ValueError),
            (2**7, DataTypes.TINYINT(), ValueError),
            ("1", DataTypes.TINYINT(), TypeError),
            (1.0, DataTypes.TINYINT(), TypeError),

            # SmallInt (one past each end of the range)
            (-(2**15) - 1, DataTypes.SMALLINT(), ValueError),
            (2**15, DataTypes.SMALLINT(), ValueError),

            # Int (one past each end of the range)
            (-(2**31) - 1, DataTypes.INT(), ValueError),
            (2**31, DataTypes.INT(), ValueError),

            # Float & Double (a Python int is not accepted)
            (1, DataTypes.FLOAT(), TypeError),
            (1, DataTypes.DOUBLE(), TypeError),

            # Decimal (only decimal.Decimal is accepted)
            (1.0, DataTypes.DECIMAL(10, 0), TypeError),
            (1, DataTypes.DECIMAL(10, 0), TypeError),
            ("1.0", DataTypes.DECIMAL(10, 0), TypeError),

            # Binary
            (1, DataTypes.BINARY(1), TypeError),
            # VarBinary (length exceeds maximum length)
            (bytearray([1, 2]), DataTypes.VARBINARY(1), ValueError),
            # Binary (length exceeds the declared length)
            (bytearray([1, 2]), DataTypes.BINARY(1), ValueError),

            # Date/Time/Timestamp (strings and ints are not accepted)
            ("2000-01-02", DataTypes.DATE(), TypeError),
            ("10:01:02", DataTypes.TIME(), TypeError),
            (946811040, DataTypes.TIMESTAMP(), TypeError),

            # Array
            (["1", None], DataTypes.ARRAY(DataTypes.VARCHAR(1,
                                                            nullable=False)),
             ValueError),
            ([1, "2"], DataTypes.ARRAY(DataTypes.INT()), TypeError),

            # Map
            ({
                "a": 1
            }, DataTypes.MAP(DataTypes.INT(), DataTypes.INT()), TypeError),
            ({
                "a": "1"
            }, DataTypes.MAP(DataTypes.VARCHAR(1),
                             DataTypes.INT()), TypeError),
            ({
                "a": None
            }, DataTypes.MAP(DataTypes.VARCHAR(1),
                             DataTypes.INT(False)), ValueError),

            # Struct
            ({
                "s": "a",
                "i": "1"
            }, schema, TypeError),
            (Row(s="a"), schema, ValueError),  # Row can't have missing field
            (Row(s="a", i="1"), schema, TypeError),
            (["a"], schema, ValueError),
            (["a", "1"], schema, TypeError),
            (MyObj(s="a", i="1"), schema, TypeError),
            (MyObj(s=None, i="1"), schema, ValueError),
        ]

        # Check success cases
        for obj, data_type in success_spec:
            try:
                _create_type_verifier(data_type.not_null())(obj)
            except (TypeError, ValueError):
                # Turn the verifier's exception into a test failure that
                # names the offending object and data type.
                self.fail("verify_type(%s, %s, nullable=False)" %
                          (obj, data_type))

        # Check failure cases
        for obj, data_type, exp in failure_spec:
            # msg identifies the case if the expected exception is not raised.
            msg = "verify_type(%s, %s, nullable=False) == %s" % (
                obj, data_type, exp)
            with self.assertRaises(exp, msg=msg):
                _create_type_verifier(data_type.not_null())(obj)
예제 #14
0
 def test_timestamp_microsecond(self):
     """``TIMESTAMP.to_sql_type`` of ``datetime.datetime.max`` must leave a
     remainder of 999999 when taken modulo 1000000.
     """
     timestamp_type = DataTypes.TIMESTAMP()
     sql_value = timestamp_type.to_sql_type(datetime.datetime.max)
     self.assertEqual(sql_value % 1000000, 999999)
예제 #15
0
    def test_sliding_group_window_over_time(self):
        """Aggregate with ``mean_udaf`` over a sliding event-time window.

        A CSV file is written to the system temp directory, exposed as
        ``source_table`` through a filesystem connector DDL, windowed with a
        1-hour slide every 30 minutes on ``rowtime``, and the per-window
        results are compared against the expected rows.

        The CSV file is removed in a ``finally`` block so that it does not
        linger in the temp directory when the job or an assertion fails.
        """
        import os
        import tempfile

        from pyflink.table.window import Slide

        data = [
            '1,1,2,2018-03-11 03:10:00', '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00', '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00', '2,2,3,2018-03-11 03:30:00'
        ]
        # create source file path
        tmp_dir = tempfile.gettempdir()
        source_path = tmp_dir + '/test_sliding_group_window_over_time.csv'
        try:
            with open(source_path, 'w') as fd:
                for ele in data:
                    fd.write(ele + '\n')

            self.env.set_stream_time_characteristic(
                TimeCharacteristic.EventTime)
            self.t_env.register_function("mean_udaf", mean_udaf)

            source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
            self.t_env.execute_sql(source_table)
            t = self.t_env.from_path("source_table")

            table_sink = source_sink_utils.TestAppendSink(
                ['a', 'b', 'c', 'd'],
                [
                    DataTypes.TINYINT(),
                    DataTypes.TIMESTAMP(3),
                    DataTypes.TIMESTAMP(3),
                    DataTypes.FLOAT()
                ])
            self.t_env.register_table_sink("Results", table_sink)
            t.window(Slide.over("1.hours").every("30.minutes")
                     .on("rowtime").alias("w")) \
                .group_by("a, b, w") \
                .select("a, w.start, w.end, mean_udaf(c) as b") \
                .execute_insert("Results") \
                .wait()
            actual = source_sink_utils.results()
            self.assert_equals(actual, [
                "1,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,2.0",
                "1,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.5",
                "1,2018-03-11 03:30:00.0,2018-03-11 04:30:00.0,5.5",
                "1,2018-03-11 04:00:00.0,2018-03-11 05:00:00.0,8.0",
                "2,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,1.0",
                "2,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.0",
                "2,2018-03-11 03:30:00.0,2018-03-11 04:30:00.0,3.0",
                "3,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.0",
                "3,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,2.0"
            ])
        finally:
            # Guarded so a failure while creating the file is not masked by a
            # FileNotFoundError from the cleanup itself.
            if os.path.exists(source_path):
                os.remove(source_path)
예제 #16
0
 def setUpClass(cls):
     """Prepare the shared fixtures: two sample rows (``cls.data``), the
     matching row type (``cls.data_type``) and the pandas data frame
     obtained from ``cls.create_pandas_data_frame()`` (``cls.pdf``).
     """
     super(PandasConversionTestBase, cls).setUpClass()

     def trailing_columns():
         # Columns f11..f15 hold the same values in both rows; they are
         # built afresh on every call so the rows share no objects.
         epoch_plus_123ms = datetime.datetime(1970, 1, 1, 0, 0, 0, 123000)
         return (
             datetime.date(2014, 9, 13),
             datetime.time(hour=1, minute=0, second=1),
             epoch_plus_123ms,
             ['hello', '中文'],
             Row(a=1,
                 b='hello',
                 c=datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                 d=[1, 2]),
         )

     first_row = (
         1, 1, 1, 1, True, 1.1, 1.2, 'hello', bytearray(b"aaa"),
         decimal.Decimal('1000000000000000000.01'),
     ) + trailing_columns()
     second_row = (
         1, 2, 2, 2, False, 2.1, 2.2, 'world', bytearray(b"bbb"),
         decimal.Decimal('1000000000000000000.02'),
     ) + trailing_columns()
     cls.data = [first_row, second_row]

     nested_row_type = DataTypes.ROW([
         DataTypes.FIELD("a", DataTypes.INT()),
         DataTypes.FIELD("b", DataTypes.STRING()),
         DataTypes.FIELD("c", DataTypes.TIMESTAMP(3)),
         DataTypes.FIELD("d", DataTypes.ARRAY(DataTypes.INT())),
     ])
     top_level_fields = [
         DataTypes.FIELD("f1", DataTypes.TINYINT()),
         DataTypes.FIELD("f2", DataTypes.SMALLINT()),
         DataTypes.FIELD("f3", DataTypes.INT()),
         DataTypes.FIELD("f4", DataTypes.BIGINT()),
         DataTypes.FIELD("f5", DataTypes.BOOLEAN()),
         DataTypes.FIELD("f6", DataTypes.FLOAT()),
         DataTypes.FIELD("f7", DataTypes.DOUBLE()),
         DataTypes.FIELD("f8", DataTypes.STRING()),
         DataTypes.FIELD("f9", DataTypes.BYTES()),
         DataTypes.FIELD("f10", DataTypes.DECIMAL(38, 18)),
         DataTypes.FIELD("f11", DataTypes.DATE()),
         DataTypes.FIELD("f12", DataTypes.TIME()),
         DataTypes.FIELD("f13", DataTypes.TIMESTAMP(3)),
         DataTypes.FIELD("f14", DataTypes.ARRAY(DataTypes.STRING())),
         DataTypes.FIELD("f15", nested_row_type),
     ]
     # Second positional argument False is kept exactly as before
     # (presumably the nullable flag -- TODO confirm).
     cls.data_type = DataTypes.ROW(top_level_fields, False)

     # Built last: runs after cls.data and cls.data_type are in place.
     cls.pdf = cls.create_pandas_data_frame()
예제 #17
0
    def test_group_aggregate_function(self):
        """Group by ``a`` and combine a general UDF, a pandas UDF and two
        pandas aggregates (one returning a ROW) in a single select.
        """
        rows = [(1, 2, 3), (3, 2, 3), (2, 1, 3),
                (1, 5, 4), (1, 8, 6), (2, 3, 4)]
        input_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
        ])
        t = self.t_env.from_elements(rows, input_type)

        sink_types = [
            DataTypes.TINYINT(),
            DataTypes.FLOAT(),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.INT()),
                DataTypes.FIELD("b", DataTypes.INT()),
            ]),
        ]
        results_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'], sink_types)
        self.t_env.register_table_sink("Results", results_sink)

        # general udf
        plus_one = udf(lambda a: a + 1, result_type=DataTypes.INT())
        # pandas udf
        minus_one = udf(
            lambda a: a - 1, result_type=DataTypes.INT(), func_type="pandas")
        # pandas aggregate returning (max, min) as a two-field ROW
        max_min_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.INT()),
            DataTypes.FIELD("b", DataTypes.INT()),
        ])
        max_udaf = udaf(lambda a: (a.max(), a.min()),
                        result_type=max_min_type,
                        func_type="pandas")

        aggregated = t.group_by("a").select(
            t.a, mean_udaf(plus_one(t.b)), max_udaf(minus_one(t.c)))
        aggregated.execute_insert("Results").wait()

        collected = source_sink_utils.results()
        self.assert_equals(
            collected, ["1,6.0,5,2", "2,3.0,3,2", "3,3.0,2,2"])
예제 #18
0
    def test_register_table_source_and_sink(self):
        """Register two CSV-backed tables through the descriptor API with
        ``register_table_source_and_sink``, copy ``a + 1, b, c`` from one to
        the other and check the file written by the sink.
        """
        # Join directory and file name as separate components; the previous
        # single-argument os.path.join call was a no-op around a manual '/'.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        self.prepare_csv_source(source_path, data, field_types, field_names)
        sink_path = os.path.join(self.tempdir, 'streaming2.csv')
        # Start from a clean slate so stale output cannot satisfy the check.
        if os.path.isfile(sink_path):
            os.remove(sink_path)
        t_env = self.t_env

        t_env.connect(FileSystem().path(source_path))\
             .with_format(OldCsv()
                          .field_delimiter(',')
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .with_schema(Schema()
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .register_table_source_and_sink("source")
        t_env.connect(FileSystem().path(sink_path))\
             .with_format(OldCsv()
                          .field_delimiter(',')
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .with_schema(Schema()
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .register_table_source_and_sink("sink")
        t_env.scan("source") \
             .select("a + 1, b, c") \
             .insert_into("sink")
        self.env.execute()

        with open(sink_path, 'r') as f:
            lines = f.read()
        # assertEqual instead of a bare assert: it still runs under
        # ``python -O`` and reports a diff when the contents differ.
        self.assertEqual(lines, '2,Hi,Hello\n' + "3,Hello,Hello\n")
예제 #19
0
    def test_map(self):
        """Chain two ``Table.map`` calls with a ROW-returning UDF and check
        the rows written to the sink.
        """
        rows = [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)]
        input_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
        ])
        t = self.t_env.from_elements(rows, input_type)

        results_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", results_sink)

        # Maps x to the pair (x + 1, x * x).
        pair_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.BIGINT()),
        ])
        func = udf(lambda x: Row(x + 1, x * x), result_type=pair_type)

        first_pass = t.map(func(t.b)).alias("a", "b")
        second_pass = first_pass.map(func(t.a)).alias("a", "b")
        second_pass.execute_insert("Results").wait()

        collected = source_sink_utils.results()
        self.assert_equals(
            collected, ["4,9", "3,4", "7,36", "10,81", "5,16"])
예제 #20
0
    def test_field(self):
        """Check the properties produced when columns are added one at a
        time with ``Schema.field``.
        """
        column_specs = [
            ("int_field", DataTypes.INT()),
            ("long_field", DataTypes.BIGINT()),
            ("string_field", DataTypes.STRING()),
            ("timestamp_field", DataTypes.TIMESTAMP()),
            ("time_field", DataTypes.TIME()),
            ("date_field", DataTypes.DATE()),
            ("double_field", DataTypes.DOUBLE()),
            ("float_field", DataTypes.FLOAT()),
            ("byte_field", DataTypes.TINYINT()),
            ("short_field", DataTypes.SMALLINT()),
            ("boolean_field", DataTypes.BOOLEAN()),
        ]
        schema = Schema()
        for column_name, column_type in column_specs:
            # Rebind to the return value, mirroring a fluent call chain.
            schema = schema.field(column_name, column_type)

        actual_properties = schema.to_properties()
        expected_properties = {
            'schema.0.name': 'int_field',
            'schema.0.type': 'INT',
            'schema.1.name': 'long_field',
            'schema.1.type': 'BIGINT',
            'schema.2.name': 'string_field',
            'schema.2.type': 'VARCHAR',
            'schema.3.name': 'timestamp_field',
            'schema.3.type': 'TIMESTAMP',
            'schema.4.name': 'time_field',
            'schema.4.type': 'TIME',
            'schema.5.name': 'date_field',
            'schema.5.type': 'DATE',
            'schema.6.name': 'double_field',
            'schema.6.type': 'DOUBLE',
            'schema.7.name': 'float_field',
            'schema.7.type': 'FLOAT',
            'schema.8.name': 'byte_field',
            'schema.8.type': 'TINYINT',
            'schema.9.name': 'short_field',
            'schema.9.type': 'SMALLINT',
            'schema.10.name': 'boolean_field',
            'schema.10.type': 'BOOLEAN',
        }
        self.assertEqual(expected_properties, actual_properties)
예제 #21
0
    def test_flat_map(self):
        """Chain two ``Table.flat_map`` calls with a table function that
        splits a comma-separated string, then check the sink contents.
        """
        rows = [(1, "2,3", 3), (2, "1", 3), (1, "5,6,7", 4)]
        input_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.STRING()),
            DataTypes.FIELD("c", DataTypes.INT()),
        ])
        t = self.t_env.from_elements(rows, input_type)

        results_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.STRING()])
        self.t_env.register_table_sink("Results", results_sink)

        @udtf(result_types=[DataTypes.INT(), DataTypes.STRING()])
        def split(x, string):
            # Emit one (x, piece) pair per comma-separated piece.
            for piece in string.split(","):
                yield x, piece

        first_pass = t.flat_map(split(t.a, t.b)).alias("a, b")
        second_pass = first_pass.flat_map(split(t.a, t.b))
        second_pass.execute_insert("Results").wait()

        collected = source_sink_utils.results()
        self.assert_equals(
            collected, ["1,2", "1,3", "2,1", "1,5", "1,6", "1,7"])
예제 #22
0
             mean_udaf(b)
             over (PARTITION BY a ORDER BY proctime
             ROWS BETWEEN 1 PRECEDING AND CURRENT ROW),
             max_add_min_udaf(b)
             over (PARTITION BY a ORDER BY proctime
             ROWS BETWEEN 1 PRECEDING AND CURRENT ROW)
            from source_table
        """).wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "1,1.0,2", "1,3.0,6", "1,6.5,13", "2,1.0,2", "2,2.0,4", "3,2.0,4"
        ])
        os.remove(source_path)


@udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
def mean_udaf(v):
    """Pandas-type aggregate returning ``v.mean()``, declared as FLOAT."""
    average = v.mean()
    return average


class MaxAdd(AggregateFunction, unittest.TestCase):
    """Aggregate function that tracks ``get_value`` calls with a counter."""

    def open(self, function_context):
        """Create the ``my_counter`` metric and reset the local running sum."""
        metric_group = function_context.get_metric_group()
        keyed_group = metric_group.add_group("key", "value")
        self.counter = keyed_group.counter("my_counter")
        self.counter_sum = 0

    def get_value(self, accumulator):
        """Advance the counter by 10 and return ``accumulator[0]``."""
        # Keep the metric counter and the local mirror in step.
        increment = 10
        self.counter.inc(increment)
        self.counter_sum += increment
        return accumulator[0]
예제 #23
0
    def test_map_type(self):
        """Round-trip MAP types through ``_to_java_type``/``_from_java_type``.

        Each type is converted to its Java form and back; the converted list
        must equal the original list.
        """
        bigint = DataTypes.BIGINT
        string = DataTypes.STRING
        map_of = DataTypes.MAP

        test_types = [
            map_of(bigint(), bigint()),
            map_of(string(), string()),
            map_of(string(), map_of(string(), bigint())),
            map_of(string(), map_of(string(), string()))
        ]

        # Convert the whole list to Java first, then convert it all back.
        java_types = list(map(_to_java_type, test_types))
        converted_python_types = list(map(_from_java_type, java_types))

        self.assertEqual(test_types, converted_python_types)
예제 #24
0
    def test_tumble_group_window_aggregate_function(self):
        """Aggregate ``b`` with ``mean_udaf`` over one-hour tumbling windows.

        The window is defined on the ``rowtime`` column. For every window the
        start, the end and the mean are written to the "Results" sink and
        compared with the expected rows.
        """
        import datetime
        from pyflink.table.window import Tumble

        ts = datetime.datetime
        rows = [
            (1, 2, 3, ts(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, ts(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, ts(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, ts(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, ts(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, ts(2018, 3, 11, 3, 30, 0, 0))
        ]
        row_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
            DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))
        ])
        source_table = self.t_env.from_elements(rows, row_type)

        sink_types = [
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.FLOAT()
        ]
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'], sink_types)
        self.t_env.register_table_sink("Results", table_sink)
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)

        window_size = expr.lit(1).hours
        tumble_window = Tumble.over(window_size)
        tumble_window = tumble_window.on(expr.col("rowtime")).alias("w")

        windowed = source_table.window(tumble_window).group_by("w")
        projected = windowed.select("w.start, w.end, mean_udaf(b)")
        projected.execute_insert("Results").wait()

        expected = [
            "2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.2",
            "2018-03-11 04:00:00.0,2018-03-11 05:00:00.0,8.0"
        ]
        actual = source_sink_utils.results()
        self.assert_equals(actual, expected)
예제 #25
0
    def test_merge_type(self):
        """Check ``_merge_type`` on matching and mismatching type pairs.

        Matching pairs (including a pairing with ``NULL``) must merge to the
        expected type. Pairs that differ in an element, key, value or
        (nested) field type must raise ``TypeError``.
        """
        null = DataTypes.NULL
        bigint = DataTypes.BIGINT
        double = DataTypes.DOUBLE
        string = DataTypes.STRING
        array = DataTypes.ARRAY
        map_of = DataTypes.MAP

        def row(*named_types):
            # Build a ROW type from (field name, field type) pairs, in order.
            return DataTypes.ROW([
                DataTypes.FIELD(name, data_type)
                for name, data_type in named_types
            ])

        def assert_conflict(left, right):
            # Merging these two types is expected to fail with TypeError.
            with self.assertRaises(TypeError):
                _merge_type(left, right)

        # Basic types, with and without NULL on either side.
        self.assertEqual(_merge_type(bigint(), null()), bigint())
        self.assertEqual(_merge_type(null(), bigint()), bigint())
        self.assertEqual(_merge_type(bigint(), bigint()), bigint())

        # Arrays.
        self.assertEqual(
            _merge_type(array(bigint()), array(bigint())),
            array(bigint()))
        assert_conflict(array(bigint()), array(double()))

        # Maps: key mismatch, then value mismatch.
        self.assertEqual(
            _merge_type(map_of(string(), bigint()),
                        map_of(string(), bigint())),
            map_of(string(), bigint()))
        assert_conflict(map_of(string(), bigint()),
                        map_of(double(), bigint()))
        assert_conflict(map_of(string(), bigint()),
                        map_of(string(), double()))

        # Rows of basic fields.
        self.assertEqual(
            _merge_type(row(('f1', bigint()), ('f2', string())),
                        row(('f1', bigint()), ('f2', string()))),
            row(('f1', bigint()), ('f2', string())))
        assert_conflict(row(('f1', bigint()), ('f2', string())),
                        row(('f1', double()), ('f2', string())))

        # Rows nested in rows.
        self.assertEqual(
            _merge_type(row(('f1', row(('f2', bigint())))),
                        row(('f1', row(('f2', bigint()))))),
            row(('f1', row(('f2', bigint())))))
        assert_conflict(row(('f1', row(('f2', bigint())))),
                        row(('f1', row(('f2', string())))))

        # Rows with an array field.
        self.assertEqual(
            _merge_type(row(('f1', array(bigint())), ('f2', string())),
                        row(('f1', array(bigint())), ('f2', string()))),
            row(('f1', array(bigint())), ('f2', string())))
        assert_conflict(row(('f1', array(bigint())), ('f2', string())),
                        row(('f1', array(double())), ('f2', string())))

        # Rows with a map field.
        self.assertEqual(
            _merge_type(
                row(('f1', map_of(string(), bigint())), ('f2', string())),
                row(('f1', map_of(string(), bigint())), ('f2', string()))),
            row(('f1', map_of(string(), bigint())), ('f2', string())))
        assert_conflict(
            row(('f1', map_of(string(), bigint())), ('f2', string())),
            row(('f1', map_of(string(), double())), ('f2', string())))

        # Rows with an array-of-map field.
        self.assertEqual(
            _merge_type(row(('f1', array(map_of(string(), bigint())))),
                        row(('f1', array(map_of(string(), bigint()))))),
            row(('f1', array(map_of(string(), bigint())))))
        assert_conflict(row(('f1', array(map_of(string(), bigint())))),
                        row(('f1', array(map_of(double(), bigint())))))
예제 #26
0
    def test_slide_group_window_aggregate_function(self):
        """Aggregate over sliding windows grouped by ``a``.

        The window is one hour long, slides every 30 minutes and is defined
        on the ``rowtime`` column. ``mean_udaf(b)`` and ``max_add(b, c, 1)``
        are evaluated per group and window, written to the "Results" sink
        and compared with the expected rows.
        """
        import datetime
        from pyflink.table.window import Slide

        ts = datetime.datetime
        rows = [
            (1, 2, 3, ts(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, ts(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, ts(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, ts(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, ts(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, ts(2018, 3, 11, 3, 30, 0, 0))
        ]
        row_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
            DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))
        ])
        source_table = self.t_env.from_elements(rows, row_type)

        sink_names = ['a', 'b', 'c', 'd', 'e']
        sink_types = [
            DataTypes.TINYINT(),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.FLOAT(),
            DataTypes.INT()
        ]
        table_sink = source_sink_utils.TestAppendSink(sink_names, sink_types)
        self.t_env.register_table_sink("Results", table_sink)

        max_add = udaf(
            MaxAdd(), result_type=DataTypes.INT(), func_type="pandas")
        self.t_env.register_function("max_add", max_add)
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)

        slide_window = Slide.over(expr.lit(1).hours)
        slide_window = slide_window.every(expr.lit(30).minutes)
        slide_window = slide_window.on(expr.col("rowtime")).alias("w")

        grouped = source_table.window(slide_window).group_by("a, w")
        projected = grouped.select(
            "a, w.start, w.end, mean_udaf(b), max_add(b, c, 1)")
        projected.execute_insert("Results").wait()

        expected = [
            "1,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,2.0,6",
            "1,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.5,7",
            "1,2018-03-11 03:30:00.0,2018-03-11 04:30:00.0,5.5,14",
            "1,2018-03-11 04:00:00.0,2018-03-11 05:00:00.0,8.0,14",
            "2,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,1.0,4",
            "2,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.0,10",
            "2,2018-03-11 03:30:00.0,2018-03-11 04:30:00.0,3.0,10",
            "3,2018-03-11 03:00:00.0,2018-03-11 04:00:00.0,2.0,7",
            "3,2018-03-11 02:30:00.0,2018-03-11 03:30:00.0,2.0,7"
        ]
        actual = source_sink_utils.results()
        self.assert_equals(actual, expected)
예제 #27
0
 def test_data_type_eq(self):
     """A BIGINT type restored from pickle equals a freshly built one."""
     fresh = DataTypes.BIGINT()
     payload = pickle.dumps(DataTypes.BIGINT())
     restored = pickle.loads(payload)
     self.assertEqual(fresh, restored)
예제 #28
0
    def test_over_window_aggregate_function(self):
        """Evaluate ``mean_udaf`` and ``max_add`` over several OVER windows.

        A single INSERT statement computes nine over-window aggregates
        (ROWS and RANGE frames, partitioned by ``a`` and ordered by
        ``rowtime``) on table "T". The rows written to the "Results" sink
        are compared with the expected output.
        """
        import datetime

        ts = datetime.datetime
        rows = [
            (1, 2, 3, ts(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 1, ts(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, ts(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, ts(2018, 3, 11, 3, 10, 0, 0)),
            (1, 8, 5, ts(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, ts(2018, 3, 11, 3, 30, 0, 0))
        ]
        row_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
            DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))
        ])
        source_table = self.t_env.from_elements(rows, row_type)

        sink_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
        # Columns d..j (seven of them) are all FLOAT.
        sink_types = [DataTypes.TINYINT(), DataTypes.FLOAT(), DataTypes.INT()]
        sink_types += [DataTypes.FLOAT() for _ in range(7)]
        table_sink = source_sink_utils.TestAppendSink(sink_names, sink_types)
        self.t_env.register_table_sink("Results", table_sink)

        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        max_add = udaf(
            MaxAdd(), result_type=DataTypes.INT(), func_type="pandas")
        self.t_env.register_function("max_add", max_add)
        self.t_env.register_table("T", source_table)

        insert_stmt = """
            insert into Results
            select a,
             mean_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN UNBOUNDED preceding AND UNBOUNDED FOLLOWING),
             max_add(b, c)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN UNBOUNDED preceding AND 0 FOLLOWING),
             mean_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING),
             mean_udaf(c)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN 1 PRECEDING AND 0 FOLLOWING),
             mean_udaf(c)
             over (PARTITION BY a ORDER BY rowtime
             RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
             mean_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW),
             mean_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND UNBOUNDED FOLLOWING),
             mean_udaf(c)
             over (PARTITION BY a ORDER BY rowtime
             RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND UNBOUNDED FOLLOWING),
             mean_udaf(c)
             over (PARTITION BY a ORDER BY rowtime
             RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND CURRENT ROW)
            from T
        """
        self.t_env.execute_sql(insert_stmt).wait()

        expected = [
            "1,4.3333335,5,4.3333335,3.0,3.0,2.5,4.3333335,3.0,2.0",
            "1,4.3333335,13,5.5,3.0,3.0,4.3333335,8.0,5.0,5.0",
            "1,4.3333335,6,4.3333335,2.0,3.0,2.5,4.3333335,3.0,2.0",
            "2,2.0,9,2.0,4.0,4.0,2.0,2.0,4.0,4.0",
            "2,2.0,3,2.0,2.0,4.0,1.0,2.0,4.0,2.0",
            "3,2.0,3,2.0,1.0,1.0,2.0,2.0,1.0,1.0"
        ]
        actual = source_sink_utils.results()
        self.assert_equals(actual, expected)
예제 #29
0
 def test_datetype_equal_zero(self):
     """``from_sql_type(0)`` on a DATE type must yield 1970-01-01."""
     date_type = DataTypes.DATE()
     converted = date_type.from_sql_type(0)
     self.assertEqual(converted, datetime.date(1970, 1, 1))
    def test_array_type(self):
        """Round-trip ARRAY types through ``_to_java_type``/``_from_java_type``.

        Each type is converted to its Java form and back; the converted list
        must equal the original list.
        """
        bigint = DataTypes.BIGINT
        string = DataTypes.STRING
        array = DataTypes.ARRAY

        test_types = [
            array(bigint()),
            # array type with not null basic data type means primitive array
            array(bigint().not_null()),
            array(string()),
            array(array(bigint())),
            array(array(string()))
        ]

        # Convert the whole list to Java first, then convert it all back.
        java_types = list(map(_to_java_type, test_types))
        converted_python_types = list(map(_from_java_type, java_types))

        self.assertEqual(test_types, converted_python_types)