Пример #1
0
 def setUpClass(cls):
     """Prepare the class-level fixtures used by the pandas conversion tests.

     ``cls.data`` holds two 15-field records that differ only in fields f2-f10,
     ``cls.data_type`` is the ROW type describing those records, and ``cls.pdf``
     is whatever ``cls.create_pandas_data_frame()`` returns.
     """
     super(PandasConversionTestBase, cls).setUpClass()

     def trailing_fields():
         # Fields f11-f15 are the same in both records; fresh objects are
         # built on every call so the records never share mutable values.
         return (datetime.date(2014, 9, 13),
                 datetime.time(hour=1, minute=0, second=1),
                 datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                 ['hello', '中文'],
                 Row(a=1, b='hello',
                     c=datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                     d=[1, 2]))

     first_head = (1, 1, 1, 1, True, 1.1, 1.2, 'hello', bytearray(b"aaa"),
                   decimal.Decimal('1000000000000000000.01'))
     second_head = (1, 2, 2, 2, False, 2.1, 2.2, 'world', bytearray(b"bbb"),
                    decimal.Decimal('1000000000000000000.02'))
     cls.data = [first_head + trailing_fields(),
                 second_head + trailing_fields()]

     nested_row_type = DataTypes.ROW(
         [DataTypes.FIELD("a", DataTypes.INT()),
          DataTypes.FIELD("b", DataTypes.STRING()),
          DataTypes.FIELD("c", DataTypes.TIMESTAMP(3)),
          DataTypes.FIELD("d", DataTypes.ARRAY(DataTypes.INT()))])
     # Column types in positional order; they are named f1 .. f15 below.
     column_types = [
         DataTypes.TINYINT(),
         DataTypes.SMALLINT(),
         DataTypes.INT(),
         DataTypes.BIGINT(),
         DataTypes.BOOLEAN(),
         DataTypes.FLOAT(),
         DataTypes.DOUBLE(),
         DataTypes.STRING(),
         DataTypes.BYTES(),
         DataTypes.DECIMAL(38, 18),
         DataTypes.DATE(),
         DataTypes.TIME(),
         DataTypes.TIMESTAMP(3),
         DataTypes.ARRAY(DataTypes.STRING()),
         nested_row_type,
     ]
     cls.data_type = DataTypes.ROW(
         [DataTypes.FIELD("f%d" % position, column_type)
          for position, column_type in enumerate(column_types, 1)],
         False)

     cls.pdf = cls.create_pandas_data_frame()
Пример #2
0
    def _create_judf(self, serialized_func, j_input_types, j_function_kind):
        """Instantiate the Java aggregate function that wraps the serialized Python function.

        For "pandas" functions the accumulator type is first set to an array of
        the result type. When ``j_input_types`` is not ``None`` it is replaced by
        a Java array built from ``self._input_types``.

        :param serialized_func: Serialized Python function; passed on as a ``bytearray``.
        :param j_input_types: Java input types, or ``None`` to pass ``None`` through.
        :param j_function_kind: Java function kind handed to the Java constructor.
        :return: The created Java (table) aggregate function object.
        """
        if self._func_type == "pandas":
            from pyflink.table.types import DataTypes
            self._accumulator_type = DataTypes.ARRAY(self._result_type)

        if j_input_types is not None:
            j_data_type_class = get_gateway().jvm.DataType
            converted_inputs = [_to_java_data_type(input_type)
                                for input_type in self._input_types]
            j_input_types = java_utils.to_jarray(j_data_type_class, converted_inputs)

        j_result_type = _to_java_data_type(self._result_type)
        j_accumulator_type = _to_java_data_type(self._accumulator_type)

        python_functions = get_gateway().jvm.org.apache.flink.table.functions.python
        # Table aggregates and plain aggregates use different Java wrapper classes.
        j_function_class = (python_functions.PythonTableAggregateFunction
                            if self._is_table_aggregate
                            else python_functions.PythonAggregateFunction)
        return j_function_class(
            self._name, bytearray(serialized_func), j_input_types,
            j_result_type, j_accumulator_type, j_function_kind,
            self._deterministic, self._takes_row_as_input, _get_python_env())
Пример #3
0
    def test_nested_udt_in_df(self):
        """Check type inference for a UDT nested inside an array and inside a map."""
        # UDT as the element type of an array.
        array_schema = DataTypes.ROW()
        array_schema = array_schema.add("_1", DataTypes.BIGINT())
        array_schema = array_schema.add("_2", DataTypes.ARRAY(PythonOnlyUDT()))
        array_record = (1, [PythonOnlyPoint(1.0, 2.0)])
        self.assertEqual(array_schema, _infer_type(array_record))

        # UDT as the value type of a map keyed by BIGINT(False).
        map_schema = DataTypes.ROW()
        map_schema = map_schema.add("_1", DataTypes.BIGINT())
        map_schema = map_schema.add(
            "_2", DataTypes.MAP(DataTypes.BIGINT(False), PythonOnlyUDT()))
        map_record = (1, {1: PythonOnlyPoint(1, 2.0)})
        self.assertEqual(map_schema, _infer_type(map_record))
Пример #4
0
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    """Build the row type, the row type info and one sample row with array columns.

    Both array columns are bridged to ``java.util.ArrayList`` in the row type
    and described as ``Types.LIST`` in the row type info.

    :return: ``(row_type, row_type_info, data)``.
    """
    bridged_class = 'java.util.ArrayList'
    string_column = DataTypes.ARRAY(DataTypes.STRING()).bridged_to(bridged_class)
    int_column = DataTypes.ARRAY(DataTypes.INT()).bridged_to(bridged_class)
    row_type = DataTypes.ROW([
        DataTypes.FIELD('string_array', string_column),
        DataTypes.FIELD('int_array', int_column),
    ])

    field_names = ['string_array', 'int_array']
    field_infos = [Types.LIST(Types.STRING()), Types.LIST(Types.INT())]
    row_type_info = Types.ROW_NAMED(field_names, field_infos)

    sample = Row(string_array=['a', 'b', 'c'], int_array=[1, 2, 3])
    return row_type, row_type_info, [sample]
Пример #5
0
    def test_array_type(self):
        """Convert array types to their Java form and back; the result must equal the input."""
        # nullable/not_null flag will be lost during the conversion.
        original_types = [
            DataTypes.ARRAY(DataTypes.BIGINT()),
            DataTypes.ARRAY(DataTypes.BIGINT()),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
            DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING())),
        ]

        # Convert every type to Java first, then convert all of them back.
        as_java = list(map(_to_java_type, original_types))
        round_tripped = list(map(_from_java_type, as_java))

        self.assertEqual(original_types, round_tripped)
Пример #6
0
    def test_array_type(self):
        """Convert array types to their Java form and back; the result must equal the input."""
        original_types = [
            DataTypes.ARRAY(DataTypes.BIGINT()),
            # array type with not null basic data type means primitive array
            DataTypes.ARRAY(DataTypes.BIGINT().not_null()),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
            DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING())),
        ]

        # Convert every type to Java first, then convert all of them back.
        as_java = list(map(_to_java_type, original_types))
        round_tripped = list(map(_from_java_type, as_java))

        self.assertEqual(original_types, round_tripped)
Пример #7
0
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, RowTypeInfo, List[Row]]:
    """Build the row type, two row type infos and one sample row with array columns.

    The first row type info describes the columns as ``Types.LIST``, the second
    ("conversion") one describes the same columns as ``Types.OBJECT_ARRAY``.

    :return: ``(row_type, row_type_info, conversion_row_type_info, data)``.
    """
    field_names = ('string_array', 'int_array')

    row_type = DataTypes.ROW([
        DataTypes.FIELD(field_names[0], DataTypes.ARRAY(DataTypes.STRING())),
        DataTypes.FIELD(field_names[1], DataTypes.ARRAY(DataTypes.INT())),
    ])
    # Each type info gets its own fresh list of names.
    row_type_info = Types.ROW_NAMED(
        list(field_names),
        [Types.LIST(Types.STRING()), Types.LIST(Types.INT())])
    conversion_row_type_info = Types.ROW_NAMED(
        list(field_names),
        [Types.OBJECT_ARRAY(Types.STRING()), Types.OBJECT_ARRAY(Types.INT())])

    sample = Row(string_array=['a', 'b', 'c'], int_array=[1, 2, 3])
    return row_type, row_type_info, conversion_row_type_info, [sample]
Пример #8
0
    def _create_judf(self, serialized_func, j_input_types, j_function_kind):
        """Instantiate the Java ``PythonAggregateFunction`` wrapping the serialized function.

        For "pandas" functions the accumulator type is first set to an array of
        the result type.

        :param serialized_func: Serialized Python function; passed on as a ``bytearray``.
        :param j_input_types: Java input types, forwarded unchanged.
        :param j_function_kind: Java function kind handed to the Java constructor.
        :return: The created Java aggregate function object.
        """
        if self._func_type == "pandas":
            from pyflink.table.types import DataTypes
            self._accumulator_type = DataTypes.ARRAY(self._result_type)

        j_result_type = _to_java_type(self._result_type)
        j_accumulator_type = _to_java_type(self._accumulator_type)

        python_functions = get_gateway().jvm.org.apache.flink.table.functions.python
        return python_functions.PythonAggregateFunction(
            self._name, bytearray(serialized_func), j_input_types,
            j_result_type, j_accumulator_type, j_function_kind,
            self._deterministic, _get_python_env())
Пример #9
0
    def add_array_column(self,
                         name: str,
                         separator: str = ';',
                         element_type: Optional[DataType] = DataTypes.STRING()) \
            -> 'CsvSchemaBuilder':
        """
        Add an array column to schema, the type of elements could be specified via ``element_type``,
        which should be primitive types.

        :param name: Name of the column.
        :param separator: Text separator of array elements, default to ``;``.
        :param element_type: DataType of array elements, default to ``DataTypes.STRING()``.
                             ``None`` is treated the same as the default.
        :return: This builder, to allow call chaining.
        """
        # The annotation allows None, so map it to the documented default
        # instead of handing None to DataTypes.ARRAY.
        if element_type is None:
            element_type = DataTypes.STRING()
        # Build the Python-side field first: if that fails, the Java builder has
        # not been modified yet and both sides of the schema stay in sync.
        array_field = DataTypes.FIELD(name, DataTypes.ARRAY(element_type))
        # NOTE(review): only name and separator are forwarded to the Java builder;
        # element_type is recorded on the Python side only - confirm this is intended.
        self._j_schema_builder.addArrayColumn(name, separator)
        self._fields.append(array_field)
        return self
Пример #10
0
 def sql_type(cls):
     """Return the SQL type of this UDT: an array whose elements are ``DataTypes.DOUBLE(False)``."""
     # The positional False is presumably the nullable flag - confirm against DataTypes.DOUBLE.
     element_type = DataTypes.DOUBLE(False)
     return DataTypes.ARRAY(element_type)
Пример #11
0
    def test_verify_type_not_nullable(self):
        """Check the type verifier on non-nullable data types.

        Every data type in the two spec tables is converted with ``not_null()``
        before the verifier is created. Entries of ``success_spec`` must verify
        without raising ``TypeError``/``ValueError``; entries of ``failure_spec``
        must raise the listed exception class.
        """
        import array
        import datetime
        import decimal

        # Row schema shared by the "Struct" cases: 's' is a non-nullable string,
        # 'i' is an INT created with a positional True argument.
        schema = DataTypes.ROW([
            DataTypes.FIELD('s', DataTypes.STRING(nullable=False)),
            DataTypes.FIELD('i', DataTypes.INT(True))
        ])

        # Plain object whose attributes are set from keyword arguments; used to
        # exercise verification of arbitrary objects against the row schema.
        class MyObj:
            def __init__(self, **kwargs):
                for k, v in kwargs.items():
                    setattr(self, k, v)

        # (obj, data_type) pairs that are expected to pass verification
        success_spec = [
            # String
            ("", DataTypes.STRING()),
            (u"", DataTypes.STRING()),

            # UDT
            (ExamplePoint(1.0, 2.0), ExamplePointUDT()),

            # Boolean
            (True, DataTypes.BOOLEAN()),

            # TinyInt
            (-(2**7), DataTypes.TINYINT()),
            (2**7 - 1, DataTypes.TINYINT()),

            # SmallInt
            (-(2**15), DataTypes.SMALLINT()),
            (2**15 - 1, DataTypes.SMALLINT()),

            # Int
            (-(2**31), DataTypes.INT()),
            (2**31 - 1, DataTypes.INT()),

            # BigInt
            # NOTE(review): 2**64 is outside the signed 64-bit range yet is listed
            # as a success case - confirm BIGINT is intentionally not range-checked.
            (2**64, DataTypes.BIGINT()),

            # Float & Double
            (1.0, DataTypes.FLOAT()),
            (1.0, DataTypes.DOUBLE()),

            # Decimal
            (decimal.Decimal("1.0"), DataTypes.DECIMAL(10, 0)),

            # Binary
            (bytearray([1]), DataTypes.BINARY(1)),

            # Date/Time/Timestamp
            (datetime.date(2000, 1, 2), DataTypes.DATE()),
            (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.DATE()),
            (datetime.time(1, 1, 2), DataTypes.TIME()),
            (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.TIMESTAMP()),

            # Array
            ([], DataTypes.ARRAY(DataTypes.INT())),
            (["1", None], DataTypes.ARRAY(DataTypes.STRING(nullable=True))),
            ([1, 2], DataTypes.ARRAY(DataTypes.INT())),
            ((1, 2), DataTypes.ARRAY(DataTypes.INT())),
            (array.array('h', [1, 2]), DataTypes.ARRAY(DataTypes.INT())),

            # Map
            ({}, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
            ({
                "a": 1
            }, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
            ({
                "a": None
            },
             DataTypes.MAP(DataTypes.STRING(nullable=False),
                           DataTypes.INT(True))),

            # Struct
            ({
                "s": "a",
                "i": 1
            }, schema),
            ({
                "s": "a",
                "i": None
            }, schema),
            ({
                "s": "a"
            }, schema),
            ({
                "s": "a",
                "f": 1.0
            }, schema),
            (Row(s="a", i=1), schema),
            (Row(s="a", i=None), schema),
            (Row(s="a", i=1, f=1.0), schema),
            (["a", 1], schema),
            (["a", None], schema),
            (("a", 1), schema),
            (MyObj(s="a", i=1), schema),
            (MyObj(s="a", i=None), schema),
            (MyObj(s="a"), schema),
        ]

        # (obj, data_type, exception class) triples that are expected to fail verification
        failure_spec = [
            # Char/VarChar (match anything but None)
            (None, DataTypes.VARCHAR(1), ValueError),
            (None, DataTypes.CHAR(1), ValueError),

            # VarChar (length exceeds maximum length)
            ("abc", DataTypes.VARCHAR(1), ValueError),
            # Char (length exceeds the declared length)
            ("abc", DataTypes.CHAR(1), ValueError),

            # UDT
            (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),

            # Boolean
            (1, DataTypes.BOOLEAN(), TypeError),
            ("True", DataTypes.BOOLEAN(), TypeError),
            ([1], DataTypes.BOOLEAN(), TypeError),

            # TinyInt
            (-(2**7) - 1, DataTypes.TINYINT(), ValueError),
            (2**7, DataTypes.TINYINT(), ValueError),
            ("1", DataTypes.TINYINT(), TypeError),
            (1.0, DataTypes.TINYINT(), TypeError),

            # SmallInt
            (-(2**15) - 1, DataTypes.SMALLINT(), ValueError),
            (2**15, DataTypes.SMALLINT(), ValueError),

            # Int
            (-(2**31) - 1, DataTypes.INT(), ValueError),
            (2**31, DataTypes.INT(), ValueError),

            # Float & Double
            (1, DataTypes.FLOAT(), TypeError),
            (1, DataTypes.DOUBLE(), TypeError),

            # Decimal
            (1.0, DataTypes.DECIMAL(10, 0), TypeError),
            (1, DataTypes.DECIMAL(10, 0), TypeError),
            ("1.0", DataTypes.DECIMAL(10, 0), TypeError),

            # Binary
            (1, DataTypes.BINARY(1), TypeError),
            # VarBinary (length exceeds maximum length)
            (bytearray([1, 2]), DataTypes.VARBINARY(1), ValueError),
            # Binary (length exceeds the declared length)
            (bytearray([1, 2]), DataTypes.BINARY(1), ValueError),

            # Date/Time/Timestamp
            ("2000-01-02", DataTypes.DATE(), TypeError),
            ("10:01:02", DataTypes.TIME(), TypeError),
            (946811040, DataTypes.TIMESTAMP(), TypeError),

            # Array
            (["1", None], DataTypes.ARRAY(DataTypes.VARCHAR(1,
                                                            nullable=False)),
             ValueError),
            ([1, "2"], DataTypes.ARRAY(DataTypes.INT()), TypeError),

            # Map
            ({
                "a": 1
            }, DataTypes.MAP(DataTypes.INT(), DataTypes.INT()), TypeError),
            ({
                "a": "1"
            }, DataTypes.MAP(DataTypes.VARCHAR(1),
                             DataTypes.INT()), TypeError),
            ({
                "a": None
            }, DataTypes.MAP(DataTypes.VARCHAR(1),
                             DataTypes.INT(False)), ValueError),

            # Struct
            ({
                "s": "a",
                "i": "1"
            }, schema, TypeError),
            (Row(s="a"), schema, ValueError),  # Row can't have missing field
            (Row(s="a", i="1"), schema, TypeError),
            (["a"], schema, ValueError),
            (["a", "1"], schema, TypeError),
            (MyObj(s="a", i="1"), schema, TypeError),
            (MyObj(s=None, i="1"), schema, ValueError),
        ]

        # Check success cases: any TypeError/ValueError is reported as a test
        # failure naming the offending object and data type.
        for obj, data_type in success_spec:
            try:
                _create_type_verifier(data_type.not_null())(obj)
            except (TypeError, ValueError):
                self.fail("verify_type(%s, %s, nullable=False)" %
                          (obj, data_type))

        # Check failure cases: the verifier must raise the listed exception class.
        for obj, data_type, exp in failure_spec:
            msg = "verify_type(%s, %s, nullable=False) == %s" % (
                obj, data_type, exp)
            with self.assertRaises(exp, msg=msg):
                _create_type_verifier(data_type.not_null())(obj)
Пример #12
0
    def test_merge_type(self):
        """Check ``_merge_type`` on matching and conflicting scalar, array, map and row types.

        Merging a type with ``NULL`` or with an equal type must return that type;
        each mismatching pair must raise ``TypeError``.
        """
        bigint, double, string = DataTypes.BIGINT, DataTypes.DOUBLE, DataTypes.STRING
        array_of, map_of = DataTypes.ARRAY, DataTypes.MAP

        def expect_merged(left, right, merged):
            self.assertEqual(_merge_type(left, right), merged)

        def expect_conflict(left, right):
            with self.assertRaises(TypeError):
                _merge_type(left, right)

        def one_field_row(field_name, field_type):
            # ROW with a single field.
            return DataTypes.ROW([DataTypes.FIELD(field_name, field_type)])

        def two_field_row(f1_type):
            # ROW with the given type for 'f1' and a STRING 'f2'.
            return DataTypes.ROW([
                DataTypes.FIELD('f1', f1_type),
                DataTypes.FIELD('f2', string())
            ])

        # Scalars, including merging with NULL on either side.
        expect_merged(bigint(), DataTypes.NULL(), bigint())
        expect_merged(DataTypes.NULL(), bigint(), bigint())
        expect_merged(bigint(), bigint(), bigint())

        # Arrays.
        expect_merged(array_of(bigint()), array_of(bigint()), array_of(bigint()))
        expect_conflict(array_of(bigint()), array_of(double()))

        # Maps: key mismatch and value mismatch both conflict.
        expect_merged(map_of(string(), bigint()),
                      map_of(string(), bigint()),
                      map_of(string(), bigint()))
        expect_conflict(map_of(string(), bigint()), map_of(double(), bigint()))
        expect_conflict(map_of(string(), bigint()), map_of(string(), double()))

        # Rows with scalar fields.
        expect_merged(two_field_row(bigint()),
                      two_field_row(bigint()),
                      two_field_row(bigint()))
        expect_conflict(two_field_row(bigint()), two_field_row(double()))

        # Rows nested in rows.
        expect_merged(one_field_row('f1', one_field_row('f2', bigint())),
                      one_field_row('f1', one_field_row('f2', bigint())),
                      one_field_row('f1', one_field_row('f2', bigint())))
        expect_conflict(one_field_row('f1', one_field_row('f2', bigint())),
                        one_field_row('f1', one_field_row('f2', string())))

        # Rows with an array field.
        expect_merged(two_field_row(array_of(bigint())),
                      two_field_row(array_of(bigint())),
                      two_field_row(array_of(bigint())))
        expect_conflict(two_field_row(array_of(bigint())),
                        two_field_row(array_of(double())))

        # Rows with a map field.
        expect_merged(two_field_row(map_of(string(), bigint())),
                      two_field_row(map_of(string(), bigint())),
                      two_field_row(map_of(string(), bigint())))
        expect_conflict(two_field_row(map_of(string(), bigint())),
                        two_field_row(map_of(string(), double())))

        # Rows with an array-of-map field.
        expect_merged(
            one_field_row('f1', array_of(map_of(string(), bigint()))),
            one_field_row('f1', array_of(map_of(string(), bigint()))),
            one_field_row('f1', array_of(map_of(string(), bigint()))))
        expect_conflict(
            one_field_row('f1', array_of(map_of(string(), bigint()))),
            one_field_row('f1', array_of(map_of(double(), bigint()))))
 def get_accumulator_type(self):
     """Return the accumulator type: an array of ``DataTypes.BIGINT()``."""
     element_type = DataTypes.BIGINT()
     return DataTypes.ARRAY(element_type)
Пример #14
0
def word_count():
    """Aggregate per-city sales from in-memory comma-separated lines into a CSV sink.

    Creates a batch table environment, registers a file-system CSV sink named
    "Results" under ``<tempdir>/result`` (removing any previous output first),
    registers the ``split`` and ``get`` Python UDFs, and runs a job that sums
    the sales volume and sales amount per city.
    """
    settings_builder = EnvironmentSettings.new_instance()
    environment_settings = settings_builder.in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(
        environment_settings=environment_settings)

    # Start from a clean output location; a failed removal is logged and the
    # job continues anyway.
    result_path = tempfile.gettempdir() + '/result'
    if os.path.exists(result_path):
        remove_existing = os.remove if os.path.isfile(result_path) else shutil.rmtree
        try:
            remove_existing(result_path)
        except OSError as err:
            logging.error("Error removing directory: %s - %s.", err.filename,
                          err.strerror)

    logging.info("Results directory: %s", result_path)

    # Set the Python executable explicitly in case `python` does not point to Python 3.
    t_env.get_config().set_python_executable("python3")

    # Register the "Results" sink table in the table environment.
    sink_descriptor = t_env.connect(FileSystem().path(result_path))
    sink_descriptor = sink_descriptor.with_format(
        OldCsv()
        .field_delimiter(',')
        .field("city", DataTypes.STRING())
        .field("sales_volume", DataTypes.BIGINT())
        .field("sales", DataTypes.BIGINT()))
    sink_descriptor = sink_descriptor.with_schema(
        Schema()
        .field("city", DataTypes.STRING())
        .field("sales_volume", DataTypes.BIGINT())
        .field("sales", DataTypes.BIGINT()))
    sink_descriptor.register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        # Break one comma-separated line into its columns.
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        # Pick a single column out of the split line.
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)

    configuration = t_env.get_config().get_configuration()
    configuration.set_string("parallelism.default", "1")

    # Each record is one line: product, count, unit price, city.
    data = [
        ("iPhone 11,30,5499,Beijing",),
        ("iPhone 11 Pro,20,8699,Guangzhou",),
        ("MacBook Pro,10,9999,Beijing",),
        ("AirPods Pro,50,1999,Beijing",),
        ("MacBook Pro,10,11499,Shanghai",),
        ("iPhone 11,30,5999,Shanghai",),
        ("iPhone 11 Pro,20,9999,Shenzhen",),
        ("MacBook Pro,10,13899,Hangzhou",),
        ("iPhone 11,10,6799,Beijing",),
        ("MacBook Pro,10,18999,Beijing",),
        ("iPhone 11 Pro,10,11799,Shenzhen",),
        ("MacBook Pro,10,22199,Shanghai",),
        ("AirPods Pro,40,1999,Shanghai",),
    ]

    lines = t_env.from_elements(data, ["line"])
    columns = lines.select("split(line) as str_array")
    orders = columns.select("get(str_array, 3) as city, "
                            "get(str_array, 1).cast(LONG) as count, "
                            "get(str_array, 2).cast(LONG) as unit_price")
    priced = orders.select("city, count, count * unit_price as total_price")
    per_city = priced.group_by("city").select("city, "
                                              "sum(count) as sales_volume, "
                                              "sum(total_price) as sales")
    per_city.insert_into("Results")

    t_env.execute("word_count")
Пример #15
0
def test():
    """Copy a CSV sales table to another CSV file, ordered by ``Region``.

    Creates a batch table environment, declares a filesystem CSV source table
    and a filesystem CSV sink table with the same 14 VARCHAR columns, registers
    two Python UDFs, then inserts the ordered source rows into the sink and
    waits for the job to finish.
    """
    # 1. create a TableEnvironment
    # Streaming-mode alternative, currently disabled:
    #env_settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    #table_env = StreamTableEnvironment.create(environment_settings=env_settings)
    env_settings = EnvironmentSettings.new_instance().in_batch_mode(
    ).use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # 2. create source Table
    table_env.execute_sql("""
        CREATE TABLE source_table (
            Region VARCHAR,
            Country	VARCHAR,
            Item_Type VARCHAR,
            Sales_Channel VARCHAR,
            Order_Priority VARCHAR,
            Order_Date VARCHAR,
            Order_ID VARCHAR,
            Ship_Date VARCHAR,
            Units_Sold VARCHAR,
            Unit_Price VARCHAR,
            Unit_Cost VARCHAR,
            Total_Revenue VARCHAR,
            Total_Cost VARCHAR,
            Total_Profit VARCHAR
        ) WITH (
            'connector' = 'filesystem',
            'path' = '/tmp/data/5m_Sales_Records.csv',
            'format' = 'csv'
        )
    """)

    # 3. create sink Table with the same columns as the source
    table_env.execute_sql("""
        CREATE TABLE sink_table (
            Region VARCHAR,
            Country	VARCHAR,
            Item_Type VARCHAR,
            Sales_Channel VARCHAR,
            Order_Priority VARCHAR,
            Order_Date VARCHAR,
            Order_ID VARCHAR,
            Ship_Date VARCHAR,
            Units_Sold VARCHAR,
            Unit_Price VARCHAR,
            Unit_Cost VARCHAR,
            Total_Revenue VARCHAR,
            Total_Cost VARCHAR,
            Total_Profit VARCHAR
        )
          WITH (
            'connector' = 'filesystem',
            'path' = '/tmp/data/xxx_Sales_Records.csv',
            'format' = 'csv'
        )
    """)

    # 4. define and register Python UDFs
    # NOTE(review): the query below does not reference `split` or `get`;
    # they are only registered here.
    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    table_env.register_function("split", split)
    table_env.register_function("get", get)

    # 5. run the query, write the result into the sink and wait for completion
    table_env.sql_query("SELECT * FROM source_table order by Region") \
            .execute_insert("sink_table").wait()