Example #1

from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.udf import udf

# https://flink.apache.org/2020/04/09/pyflink-udf-support-flink.html
# https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/python/table-api-users-guide/udfs/python_udfs.html


@udf(input_types=[DataTypes.STRING()],
     result_type=DataTypes.MAP(DataTypes.STRING(), DataTypes.STRING()))
def parse(s):
    import json
    # a dummy parser
    res = {}
    content = json.loads(s)
    if 'item_id' in content:
        res['item_id'] = str(
            content['item_id'])  # REMEMBER to match the result_type
    if 'tag' in content:
        res['tag'] = content['tag']
    return res


env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(env)

t_env.register_function("parse", parse)

my_source_ddl = """
create table mySource (
    id BIGINT,
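
The source DDL above is truncated in this excerpt. A minimal hypothetical sketch of using the registered function once a complete source table exists (a STRING column named line is assumed purely for illustration):

t_env.sql_query("SELECT parse(line) AS parsed FROM mySource")
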
Example #2

    def setUp(self):
        super(PyFlinkEmbeddedThreadTests, self).setUp()
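        # "thread" mode runs Python UDFs embedded in the JVM (via PEMJA) instead of in a separate Python worker process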
        self.t_env.get_config().set("python.execution-mode", "thread")


class MultiEmit(TableFunction, unittest.TestCase):
    def open(self, function_context):
        self.counter_sum = 0

    def eval(self, x, y):
        self.counter_sum += y
        for i in range(y):
            yield x, i


@udtf(result_types=[DataTypes.BIGINT()])
def identity(x):
    if x is not None:
        from pyflink.common import Row
        return Row(x)


# test specify the input_types
@udtf(input_types=[DataTypes.BIGINT(), DataTypes.BIGINT()],
      result_types=DataTypes.BIGINT())
def condition_multi_emit(x, y):
    if x == 3:
        return range(y, x)


class MultiNum(ScalarFunction):
Example #3

def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("eventTime")
                .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
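
A minimal sketch of wiring this source into a job, assuming the legacy connect()/descriptor API (Kafka, Json, Schema and Rowtime come from pyflink.table.descriptors):

from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment
from pyflink.table.descriptors import Kafka, Json, Schema, Rowtime

env = StreamExecutionEnvironment.get_execution_environment()
st_env = StreamTableEnvironment.create(env)
register_rides_source(st_env)
rides = st_env.scan("source")  # read back the table registered above
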
Example #4

            ]))

        t.select("local_zoned_timestamp_func(local_zoned_timestamp_func(a))") \
            .insert_into("Results")
        self.t_env.execute("test")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1970-01-01T00:00:00.123Z"])


class PyFlinkBlinkBatchUserDefinedFunctionTests(UserDefinedFunctionTests,
                                                PyFlinkBlinkBatchTableTestCase
                                                ):
    pass


@udf(input_types=[DataTypes.BIGINT(), DataTypes.BIGINT()],
     result_type=DataTypes.BIGINT())
def add(i, j):
    return i + j


class SubtractOne(ScalarFunction):
    def eval(self, i):
        return i - 1


class Subtract(ScalarFunction, unittest.TestCase):
    def open(self, function_context):
        self.subtracted_value = 1
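        # create a counter metric named "my_counter" in the nested metric group key/value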
        mg = function_context.get_metric_group()
        self.counter = mg.add_group("key", "value").counter("my_counter")
Example #5

    def test_scalar_function(self):
        # test metric disabled.
        self.t_env.get_config().get_configuration().set_string(
            'python.metric.enabled', 'false')
        # test lambda function
        self.t_env.register_function(
            "add_one",
            udf(lambda i: i + 1, DataTypes.BIGINT(), DataTypes.BIGINT()))

        # test Python ScalarFunction
        self.t_env.register_function(
            "subtract_one",
            udf(SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT()))

        # test Python function
        self.t_env.register_function("add", add)

        # test callable function
        self.t_env.register_function(
            "add_one_callable",
            udf(CallablePlus(), DataTypes.BIGINT(), DataTypes.BIGINT()))

        def partial_func(col, param):
            return col + param

        # test partial function
        import functools
        self.t_env.register_function(
            "add_one_partial",
            udf(functools.partial(partial_func, param=1), DataTypes.BIGINT(),
                DataTypes.BIGINT()))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd', 'e', 'f'], [
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT()
            ])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)],
                                     ['a', 'b', 'c'])
        t.where("add_one(b) <= 3") \
            .select("add_one(a), subtract_one(b), add(a, c), add_one_callable(a), "
                    "add_one_partial(a), a") \
            .insert_into("Results")
        self.t_env.execute("test")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["2,1,4,2,2,1", "4,0,12,4,4,3"])
Example #6

def get_accumulator_type(self):
    return DataTypes.ROW([
        DataTypes.FIELD(
            "f0", DataTypes.MAP(DataTypes.STRING(), DataTypes.STRING())),
        DataTypes.FIELD("f1", DataTypes.BIGINT())
    ])
Example #7

def get_accumulator_type(self):
    return DataTypes.ARRAY(DataTypes.BIGINT())
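
Fragments like the two above are methods of a Python AggregateFunction. A minimal sketch of the surrounding contract, using an illustrative sum aggregate whose accumulator matches the ARRAY type above:

from pyflink.table import AggregateFunction, DataTypes

class SumAgg(AggregateFunction):  # illustrative name and logic
    def create_accumulator(self):
        return [0]

    def accumulate(self, accumulator, value):
        accumulator[0] += value

    def get_value(self, accumulator):
        return accumulator[0]

    def get_result_type(self):
        return DataTypes.BIGINT()

    def get_accumulator_type(self):
        return DataTypes.ARRAY(DataTypes.BIGINT())
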
Example #8

 def test_collect_for_all_data_types(self):
     expected_result = [
         Row(1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
             bytearray(b'pyflink'), 'pyflink', datetime.date(2014, 9, 13),
             datetime.time(12, 0, 0, 123000),
             datetime.datetime(2018, 3, 11, 3, 0, 0, 123000),
             [Row(['[pyflink]']),
              Row(['[pyflink]']),
              Row(['[pyflink]'])], {
                  1: Row(['[flink]']),
                  2: Row(['[pyflink]'])
              }, decimal.Decimal('1000000000000000000.050000000000000000'),
             decimal.Decimal('1000000000000000000.059999999999999999'))
     ]
     source = self.t_env.from_elements(
         [(1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
           bytearray(b'pyflink'), 'pyflink', datetime.date(2014, 9, 13),
           datetime.time(hour=12, minute=0, second=0, microsecond=123000),
           datetime.datetime(2018, 3, 11, 3, 0, 0, 123000),
           [Row(['pyflink']),
            Row(['pyflink']),
            Row(['pyflink'])], {
                1: Row(['flink']),
                2: Row(['pyflink'])
            }, decimal.Decimal('1000000000000000000.05'),
           decimal.Decimal(
               '1000000000000000000.05999999999999999899999999999'))],
         DataTypes.ROW([
             DataTypes.FIELD("a", DataTypes.BIGINT()),
             DataTypes.FIELD("b", DataTypes.BIGINT()),
             DataTypes.FIELD("c", DataTypes.TINYINT()),
             DataTypes.FIELD("d", DataTypes.BOOLEAN()),
             DataTypes.FIELD("e", DataTypes.SMALLINT()),
             DataTypes.FIELD("f", DataTypes.INT()),
             DataTypes.FIELD("g", DataTypes.FLOAT()),
             DataTypes.FIELD("h", DataTypes.DOUBLE()),
             DataTypes.FIELD("i", DataTypes.BYTES()),
             DataTypes.FIELD("j", DataTypes.STRING()),
             DataTypes.FIELD("k", DataTypes.DATE()),
             DataTypes.FIELD("l", DataTypes.TIME()),
             DataTypes.FIELD("m", DataTypes.TIMESTAMP(3)),
             DataTypes.FIELD(
                 "n",
                 DataTypes.ARRAY(
                     DataTypes.ROW(
                         [DataTypes.FIELD('ss2', DataTypes.STRING())]))),
             DataTypes.FIELD(
                 "o",
                 DataTypes.MAP(
                     DataTypes.BIGINT(),
                     DataTypes.ROW(
                         [DataTypes.FIELD('ss', DataTypes.STRING())]))),
             DataTypes.FIELD("p", DataTypes.DECIMAL(38, 18)),
             DataTypes.FIELD("q", DataTypes.DECIMAL(38, 18))
         ]))
     table_result = source.execute()
     with table_result.collect() as result:
         collected_result = []
         for i in result:
             collected_result.append(i)
         self.assertEqual(expected_result, collected_result)
Example #9

    def test_from_element(self):
        t_env = self.t_env
        a = array.array('b')
        a.frombytes(b'ABCD')  # array.fromstring() is gone in Python 3; frombytes takes the same ASCII bytes
        t = t_env.from_elements([
            (1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
             datetime.time(1, 0, 0), datetime.datetime(1970, 1, 2, 0,
                                                       0), [1.0, None],
             array.array("d",
                         [1.0, 2.0]), ["abc"], [datetime.date(1970, 1, 2)],
             Decimal(1), Row("a", "b")(1, 2.0), {
                 "key": 1.0
             }, a, ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0))
        ])
        field_names = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q"
        ]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(),
            DataTypes.ARRAY(DataTypes.DOUBLE()),
            DataTypes.ARRAY(DataTypes.DOUBLE(False)),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.DATE()),
            DataTypes.DECIMAL(10, 0),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.DOUBLE())
            ]),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
            DataTypes.BYTES(),
            ExamplePointUDT(),
            PythonOnlyUDT()
        ]
        table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
        t_env.register_table_sink("Results", table_sink)

        t.insert_into("Results")
        t_env.exec_env().execute()
        actual = source_sink_utils.results()

        expected = [
            '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,[1.0, null],'
            '[1.0, 2.0],[abc],[1970-01-02],1,1,2.0,{key=1.0},[65, 66, 67, 68],[1.0, 2.0],'
            '[3.0, 4.0]'
        ]
        self.assert_equals(actual, expected)
Example #10

    def test_all_data_types(self):
        import pandas as pd
        import numpy as np

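        # Each pandas UDF below receives its argument as a pandas.Series batch;
        # the asserts pin down the element type delivered for each SQL type.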
        def tinyint_func(tinyint_param):
            assert isinstance(tinyint_param, pd.Series)
            assert isinstance(tinyint_param[0], np.int8), \
                'tinyint_param of wrong type %s !' % type(tinyint_param[0])
            return tinyint_param

        def smallint_func(smallint_param):
            assert isinstance(smallint_param, pd.Series)
            assert isinstance(smallint_param[0], np.int16), \
                'smallint_param of wrong type %s !' % type(smallint_param[0])
            assert smallint_param[0] == 32767, 'smallint_param of wrong value %s' % smallint_param
            return smallint_param

        def int_func(int_param):
            assert isinstance(int_param, pd.Series)
            assert isinstance(int_param[0], np.int32), \
                'int_param of wrong type %s !' % type(int_param[0])
            assert int_param[0] == -2147483648, 'int_param of wrong value %s' % int_param
            return int_param

        def bigint_func(bigint_param):
            assert isinstance(bigint_param, pd.Series)
            assert isinstance(bigint_param[0], np.int64), \
                'bigint_param of wrong type %s !' % type(bigint_param[0])
            return bigint_param

        def boolean_func(boolean_param):
            assert isinstance(boolean_param, pd.Series)
            assert isinstance(boolean_param[0], np.bool_), \
                'boolean_param of wrong type %s !' % type(boolean_param[0])
            return boolean_param

        def float_func(float_param):
            assert isinstance(float_param, pd.Series)
            assert isinstance(float_param[0], np.float32), \
                'float_param of wrong type %s !' % type(float_param[0])
            return float_param

        def double_func(double_param):
            assert isinstance(double_param, pd.Series)
            assert isinstance(double_param[0], np.float64), \
                'double_param of wrong type %s !' % type(double_param[0])
            return double_param

        def varchar_func(varchar_param):
            assert isinstance(varchar_param, pd.Series)
            assert isinstance(varchar_param[0], str), \
                'varchar_param of wrong type %s !' % type(varchar_param[0])
            return varchar_param

        def varbinary_func(varbinary_param):
            assert isinstance(varbinary_param, pd.Series)
            assert isinstance(varbinary_param[0], bytes), \
                'varbinary_param of wrong type %s !' % type(varbinary_param[0])
            return varbinary_param

        def decimal_func(decimal_param):
            assert isinstance(decimal_param, pd.Series)
            assert isinstance(decimal_param[0], decimal.Decimal), \
                'decimal_param of wrong type %s !' % type(decimal_param[0])
            return decimal_param

        def date_func(date_param):
            assert isinstance(date_param, pd.Series)
            assert isinstance(date_param[0], datetime.date), \
                'date_param of wrong type %s !' % type(date_param[0])
            return date_param

        def time_func(time_param):
            assert isinstance(time_param, pd.Series)
            assert isinstance(time_param[0], datetime.time), \
                'time_param of wrong type %s !' % type(time_param[0])
            return time_param

        timestamp_value = datetime.datetime(1970, 1, 2, 0, 0, 0, 123000)

        def timestamp_func(timestamp_param):
            assert isinstance(timestamp_param, pd.Series)
            assert isinstance(timestamp_param[0], datetime.datetime), \
                'timestamp_param of wrong type %s !' % type(timestamp_param[0])
            assert timestamp_param[0] == timestamp_value, \
                'timestamp_param is wrong value %s, should be %s!' % (timestamp_param[0],
                                                                      timestamp_value)
            return timestamp_param

        def array_func(array_param):
            assert isinstance(array_param, pd.Series)
            assert isinstance(array_param[0], np.ndarray), \
                'array_param of wrong type %s !' % type(array_param[0])
            return array_param

        def nested_array_func(nested_array_param):
            assert isinstance(nested_array_param, pd.Series)
            assert isinstance(nested_array_param[0], np.ndarray), \
                'nested_array_param of wrong type %s !' % type(nested_array_param[0])
            return pd.Series(nested_array_param[0])

        def row_func(row_param):
            assert isinstance(row_param, pd.Series)
            assert isinstance(row_param[0], dict), \
                'row_param of wrong type %s !' % type(row_param[0])
            return row_param

        self.t_env.create_temporary_system_function(
            "tinyint_func",
            udf(tinyint_func, result_type=DataTypes.TINYINT(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "smallint_func",
            udf(smallint_func, result_type=DataTypes.SMALLINT(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "int_func",
            udf(int_func, result_type=DataTypes.INT(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "bigint_func",
            udf(bigint_func, result_type=DataTypes.BIGINT(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "boolean_func",
            udf(boolean_func, result_type=DataTypes.BOOLEAN(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "float_func",
            udf(float_func, result_type=DataTypes.FLOAT(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "double_func",
            udf(double_func, result_type=DataTypes.DOUBLE(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "varchar_func",
            udf(varchar_func, result_type=DataTypes.STRING(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "varbinary_func",
            udf(varbinary_func, result_type=DataTypes.BYTES(), udf_type="pandas"))

        self.t_env.register_function(
            "decimal_func",
            udf(decimal_func, result_type=DataTypes.DECIMAL(38, 18), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "date_func",
            udf(date_func, result_type=DataTypes.DATE(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "time_func",
            udf(time_func, result_type=DataTypes.TIME(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "timestamp_func",
            udf(timestamp_func, result_type=DataTypes.TIMESTAMP(3), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "array_str_func",
            udf(array_func, result_type=DataTypes.ARRAY(DataTypes.STRING()), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "array_timestamp_func",
            udf(array_func, result_type=DataTypes.ARRAY(DataTypes.TIMESTAMP(3)), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "array_int_func",
            udf(array_func, result_type=DataTypes.ARRAY(DataTypes.INT()), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "nested_array_func",
            udf(nested_array_func,
                result_type=DataTypes.ARRAY(DataTypes.STRING()), udf_type="pandas"))

        row_type = DataTypes.ROW(
            [DataTypes.FIELD("f1", DataTypes.INT()),
             DataTypes.FIELD("f2", DataTypes.STRING()),
             DataTypes.FIELD("f3", DataTypes.TIMESTAMP(3)),
             DataTypes.FIELD("f4", DataTypes.ARRAY(DataTypes.INT()))])
        self.t_env.create_temporary_system_function(
            "row_func",
            udf(row_func, result_type=row_type, udf_type="pandas"))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
             'r', 's', 't', 'u'],
            [DataTypes.TINYINT(), DataTypes.SMALLINT(), DataTypes.INT(), DataTypes.BIGINT(),
             DataTypes.BOOLEAN(), DataTypes.BOOLEAN(), DataTypes.FLOAT(), DataTypes.DOUBLE(),
             DataTypes.STRING(), DataTypes.STRING(), DataTypes.BYTES(), DataTypes.DECIMAL(38, 18),
             DataTypes.DECIMAL(38, 18), DataTypes.DATE(), DataTypes.TIME(), DataTypes.TIMESTAMP(3),
             DataTypes.ARRAY(DataTypes.STRING()), DataTypes.ARRAY(DataTypes.TIMESTAMP(3)),
             DataTypes.ARRAY(DataTypes.INT()),
             DataTypes.ARRAY(DataTypes.STRING()), row_type])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements(
            [(1, 32767, -2147483648, 1, True, False, 1.0, 1.0, 'hello', '中文',
              bytearray(b'flink'), decimal.Decimal('1000000000000000000.05'),
              decimal.Decimal('1000000000000000000.05999999999999999899999999999'),
              datetime.date(2014, 9, 13), datetime.time(hour=1, minute=0, second=1),
              timestamp_value, ['hello', '中文', None], [timestamp_value], [1, 2],
              [['hello', '中文', None]], Row(1, 'hello', timestamp_value, [1, 2]))],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("d", DataTypes.BIGINT()),
                 DataTypes.FIELD("e", DataTypes.BOOLEAN()),
                 DataTypes.FIELD("f", DataTypes.BOOLEAN()),
                 DataTypes.FIELD("g", DataTypes.FLOAT()),
                 DataTypes.FIELD("h", DataTypes.DOUBLE()),
                 DataTypes.FIELD("i", DataTypes.STRING()),
                 DataTypes.FIELD("j", DataTypes.STRING()),
                 DataTypes.FIELD("k", DataTypes.BYTES()),
                 DataTypes.FIELD("l", DataTypes.DECIMAL(38, 18)),
                 DataTypes.FIELD("m", DataTypes.DECIMAL(38, 18)),
                 DataTypes.FIELD("n", DataTypes.DATE()),
                 DataTypes.FIELD("o", DataTypes.TIME()),
                 DataTypes.FIELD("p", DataTypes.TIMESTAMP(3)),
                 DataTypes.FIELD("q", DataTypes.ARRAY(DataTypes.STRING())),
                 DataTypes.FIELD("r", DataTypes.ARRAY(DataTypes.TIMESTAMP(3))),
                 DataTypes.FIELD("s", DataTypes.ARRAY(DataTypes.INT())),
                 DataTypes.FIELD("t", DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))),
                 DataTypes.FIELD("u", row_type)]))

        exec_insert_table(t.select("tinyint_func(a),"
                                   "smallint_func(b),"
                                   "int_func(c),"
                                   "bigint_func(d),"
                                   "boolean_func(e),"
                                   "boolean_func(f),"
                                   "float_func(g),"
                                   "double_func(h),"
                                   "varchar_func(i),"
                                   "varchar_func(j),"
                                   "varbinary_func(k),"
                                   "decimal_func(l),"
                                   "decimal_func(m),"
                                   "date_func(n),"
                                   "time_func(o),"
                                   "timestamp_func(p),"
                                   "array_str_func(q),"
                                   "array_timestamp_func(r),"
                                   "array_int_func(s),"
                                   "nested_array_func(t),"
                                   "row_func(u)"), "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["1,32767,-2147483648,1,true,false,1.0,1.0,hello,中文,"
                            "[102, 108, 105, 110, 107],1000000000000000000.050000000000000000,"
                            "1000000000000000000.059999999999999999,2014-09-13,01:00:01,"
                            "1970-01-02 00:00:00.123,[hello, 中文, null],[1970-01-02 00:00:00.123],"
                            "[1, 2],[hello, 中文, null],1,hello,1970-01-02 00:00:00.123,[1, 2]"])
Example #11

from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes
from pyflink.table.udf import udaf


class WeightedAvg(AggregateFunction):
    def create_accumulator(self):
        # Row(sum, count)
        return Row(0, 0)

    def get_value(self, accumulator: Row) -> float:
        if accumulator[1] == 0:
            return 0
        else:
            return accumulator[0] / accumulator[1]

    def accumulate(self, accumulator: Row, value, weight):
        accumulator[0] += value * weight
        accumulator[1] += weight

    def retract(self, accumulator: Row, value, weight):
        accumulator[0] -= value * weight
        accumulator[1] -= weight


weighted_avg = udaf(f=WeightedAvg(),
                    result_type=DataTypes.DOUBLE(),
                    accumulator_type=DataTypes.ROW([
                        DataTypes.FIELD("f0", DataTypes.BIGINT()),
                        DataTypes.FIELD("f1", DataTypes.BIGINT())
                    ]))
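
A minimal usage sketch, assuming a Flink version with general Python UDAF support; the table and column names are illustrative:

from pyflink.table import EnvironmentSettings, TableEnvironment

t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
t = t_env.from_elements([("a", 1, 2), ("a", 3, 4)], ['key', 'value', 'weight'])
t.group_by(t.key) \
    .select(t.key, weighted_avg(t.value, t.weight)) \
    .execute().print()
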
Example #12

def test_non_exist_udf_type(self):
    with self.assertRaisesRegex(ValueError,
                                'The udf_type must be one of \'general, pandas\''):
        udf(lambda i: i + 1, result_type=DataTypes.BIGINT(), udf_type="non-exist")
Example #13

        result = self.collect(t)
        self.assert_equals(result, ["1,3,6,3", "3,2,14,5"])


class BlinkBatchPandasUDFITTests(PandasUDFITTests,
                                 BlinkPandasUDFITTests,
                                 PyFlinkBlinkBatchTableTestCase):
    pass


class BlinkStreamPandasUDFITTests(PandasUDFITTests,
                                  BlinkPandasUDFITTests,
                                  PyFlinkBlinkStreamTableTestCase):
    pass


@udf(result_type=DataTypes.BIGINT(), udf_type='pandas')
def add(i, j):
    return i + j


if __name__ == '__main__':
    import unittest

    try:
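        # xmlrunner emits JUnit-style XML reports (useful in CI); fall back to the default runner if it is missing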
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports')
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)
Example #14

def pv_uv_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user_behavior")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    user_id: {"
                "      type: 'string'"
                "    },"
                "    item_id: {"
                "      type: 'string'"
                "    },"
                "    category_id: {"
                "      type: 'string'"
                "    },"
                "    behavior: {"
                "      type: 'string'"
                "    },"
                "    ts: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("user_id", DataTypes.STRING())
            .field("item_id", DataTypes.STRING())
            .field("category_id", DataTypes.STRING())
            .field("behavior", DataTypes.STRING())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
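                # event time comes from the ts field; watermarks trail the largest seen timestamp by 60s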
                Rowtime()
                .timestamps_from_field("ts")
                .watermarks_periodic_bounded(60000))
         ) \
        .in_append_mode() \
        .register_table_source("source")

    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \
        .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \
        .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \
        .property("connector.table", "pv_uv_table") \
        .property("connector.write.flush.max-rows", "1")
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("startTime", DataTypes.TIMESTAMP())
            .field("endTime", DataTypes.TIMESTAMP())
            .field("pv", DataTypes.BIGINT())
            .field("uv", DataTypes.BIGINT())
        ).register_table_sink("sink")

    st_env.scan("source") \
        .window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("w.start as startTime, w.end as endTime, "
                "COUNT(1) as pv, user_id.count.distinct as uv") \
        .insert_into("sink")

    st_env.execute("table pv uv")
Example #15

    def test_expression(self):
        expr1 = col('a')
        expr2 = col('b')
        expr3 = col('c')
        expr4 = col('d')
        expr5 = lit(10)

        # comparison functions
        self.assertEqual('equals(a, b)', str(expr1 == expr2))
        self.assertEqual('mod(2, b)', str(2 % expr2))
        self.assertEqual('notEquals(a, b)', str(expr1 != expr2))
        self.assertEqual('lessThan(a, b)', str(expr1 < expr2))
        self.assertEqual('lessThanOrEqual(a, b)', str(expr1 <= expr2))
        self.assertEqual('greaterThan(a, b)', str(expr1 > expr2))
        self.assertEqual('greaterThanOrEqual(a, b)', str(expr1 >= expr2))

        # logic functions
        self.assertEqual('and(a, b)', str(expr1 & expr2))
        self.assertEqual('or(a, b)', str(expr1 | expr2))
        self.assertEqual('isNotTrue(a)', str(expr1.is_not_true))
        self.assertEqual('isNotTrue(a)', str(~expr1))

        # arithmetic functions
        self.assertEqual('plus(a, b)', str(expr1 + expr2))
        self.assertEqual('plus(2, b)', str(2 + expr2))
        self.assertEqual('plus(cast(b, DATE), 2)', str(expr2.to_date + 2))
        self.assertEqual('minus(a, b)', str(expr1 - expr2))
        self.assertEqual('minus(cast(b, DATE), 2)', str(expr2.to_date - 2))
        self.assertEqual('times(a, b)', str(expr1 * expr2))
        self.assertEqual('divide(a, b)', str(expr1 / expr2))
        self.assertEqual('mod(a, b)', str(expr1 % expr2))
        self.assertEqual('power(a, b)', str(expr1**expr2))
        self.assertEqual('minusPrefix(a)', str(-expr1))

        self.assertEqual('exp(a)', str(expr1.exp))
        self.assertEqual('log10(a)', str(expr1.log10))
        self.assertEqual('log2(a)', str(expr1.log2))
        self.assertEqual('ln(a)', str(expr1.ln))
        self.assertEqual('log(a)', str(expr1.log()))
        self.assertEqual('cosh(a)', str(expr1.cosh))
        self.assertEqual('sinh(a)', str(expr1.sinh))
        self.assertEqual('sin(a)', str(expr1.sin))
        self.assertEqual('cos(a)', str(expr1.cos))
        self.assertEqual('tan(a)', str(expr1.tan))
        self.assertEqual('cot(a)', str(expr1.cot))
        self.assertEqual('asin(a)', str(expr1.asin))
        self.assertEqual('acos(a)', str(expr1.acos))
        self.assertEqual('atan(a)', str(expr1.atan))
        self.assertEqual('tanh(a)', str(expr1.tanh))
        self.assertEqual('degrees(a)', str(expr1.degrees))
        self.assertEqual('radians(a)', str(expr1.radians))
        self.assertEqual('sqrt(a)', str(expr1.sqrt))
        self.assertEqual('abs(a)', str(expr1.abs))
        self.assertEqual('abs(a)', str(abs(expr1)))
        self.assertEqual('sign(a)', str(expr1.sign))
        self.assertEqual('round(a, b)', str(expr1.round(expr2)))
        self.assertEqual('between(a, b, c)', str(expr1.between(expr2, expr3)))
        self.assertEqual('notBetween(a, b, c)',
                         str(expr1.not_between(expr2, expr3)))
        self.assertEqual('ifThenElse(a, b, c)', str(expr1.then(expr2, expr3)))

        self.assertEqual('isNull(a)', str(expr1.is_null))
        self.assertEqual('isNotNull(a)', str(expr1.is_not_null))
        self.assertEqual('isTrue(a)', str(expr1.is_true))
        self.assertEqual('isFalse(a)', str(expr1.is_false))
        self.assertEqual('isNotTrue(a)', str(expr1.is_not_true))
        self.assertEqual('isNotFalse(a)', str(expr1.is_not_false))
        self.assertEqual('distinct(a)', str(expr1.distinct))
        self.assertEqual('sum(a)', str(expr1.sum))
        self.assertEqual('sum0(a)', str(expr1.sum0))
        self.assertEqual('min(a)', str(expr1.min))
        self.assertEqual('max(a)', str(expr1.max))
        self.assertEqual('count(a)', str(expr1.count))
        self.assertEqual('avg(a)', str(expr1.avg))
        self.assertEqual('stddevPop(a)', str(expr1.stddev_pop))
        self.assertEqual('stddevSamp(a)', str(expr1.stddev_samp))
        self.assertEqual('varPop(a)', str(expr1.var_pop))
        self.assertEqual('varSamp(a)', str(expr1.var_samp))
        self.assertEqual('collect(a)', str(expr1.collect))
        self.assertEqual("as(a, 'a', 'b', 'c')",
                         str(expr1.alias('a', 'b', 'c')))
        self.assertEqual('cast(a, INT)', str(expr1.cast(DataTypes.INT())))
        self.assertEqual('asc(a)', str(expr1.asc))
        self.assertEqual('desc(a)', str(expr1.desc))
        self.assertEqual('in(a, b, c, d)', str(expr1.in_(expr2, expr3, expr4)))
        self.assertEqual('start(a)', str(expr1.start))
        self.assertEqual('end(a)', str(expr1.end))
        self.assertEqual('bin(a)', str(expr1.bin))
        self.assertEqual('hex(a)', str(expr1.hex))
        self.assertEqual('truncate(a, 3)', str(expr1.truncate(3)))

        # string functions
        self.assertEqual('substring(a, b, 3)', str(expr1.substring(expr2, 3)))
        self.assertEqual("trim(true, false, ' ', a)",
                         str(expr1.trim_leading()))
        self.assertEqual("trim(false, true, ' ', a)",
                         str(expr1.trim_trailing()))
        self.assertEqual("trim(true, true, ' ', a)", str(expr1.trim()))
        self.assertEqual('replace(a, b, c)', str(expr1.replace(expr2, expr3)))
        self.assertEqual('charLength(a)', str(expr1.char_length))
        self.assertEqual('upper(a)', str(expr1.upper_case))
        self.assertEqual('lower(a)', str(expr1.lower_case))
        self.assertEqual('initCap(a)', str(expr1.init_cap))
        self.assertEqual("like(a, 'Jo_n%')", str(expr1.like('Jo_n%')))
        self.assertEqual("similar(a, 'A+')", str(expr1.similar('A+')))
        self.assertEqual('position(a, b)', str(expr1.position(expr2)))
        self.assertEqual('lpad(a, 4, b)', str(expr1.lpad(4, expr2)))
        self.assertEqual('rpad(a, 4, b)', str(expr1.rpad(4, expr2)))
        self.assertEqual('overlay(a, b, 6, 2)', str(expr1.overlay(expr2, 6,
                                                                  2)))
        self.assertEqual("regexpReplace(a, b, 'abc')",
                         str(expr1.regexp_replace(expr2, 'abc')))
        self.assertEqual('regexpExtract(a, b, 3)',
                         str(expr1.regexp_extract(expr2, 3)))
        self.assertEqual('fromBase64(a)', str(expr1.from_base64))
        self.assertEqual('toBase64(a)', str(expr1.to_base64))
        self.assertEqual('ltrim(a)', str(expr1.ltrim))
        self.assertEqual('rtrim(a)', str(expr1.rtrim))
        self.assertEqual('repeat(a, 3)', str(expr1.repeat(3)))
        self.assertEqual("over(a, 'w')", str(expr1.over('w')))

        # temporal functions
        self.assertEqual('cast(a, DATE)', str(expr1.to_date))
        self.assertEqual('cast(a, TIME(0))', str(expr1.to_time))
        self.assertEqual('cast(a, TIMESTAMP(3))', str(expr1.to_timestamp))
        self.assertEqual('extract(YEAR, a)',
                         str(expr1.extract(TimeIntervalUnit.YEAR)))
        self.assertEqual('floor(a, YEAR)',
                         str(expr1.floor(TimeIntervalUnit.YEAR)))
        self.assertEqual('ceil(a)', str(expr1.ceil()))

        # advanced type helper functions
        self.assertEqual("get(a, 'col')", str(expr1.get('col')))
        self.assertEqual('flatten(a)', str(expr1.flatten))
        self.assertEqual('at(a, 0)', str(expr1.at(0)))
        self.assertEqual('cardinality(a)', str(expr1.cardinality))
        self.assertEqual('element(a)', str(expr1.element))

        # time definition functions
        self.assertEqual('rowtime(a)', str(expr1.rowtime))
        self.assertEqual('proctime(a)', str(expr1.proctime))
        self.assertEqual('120', str(expr5.year))
        self.assertEqual('120', str(expr5.years))
        self.assertEqual('30', str(expr5.quarter))
        self.assertEqual('30', str(expr5.quarters))
        self.assertEqual('10', str(expr5.month))
        self.assertEqual('10', str(expr5.months))
        self.assertEqual('6048000000', str(expr5.week))
        self.assertEqual('6048000000', str(expr5.weeks))
        self.assertEqual('864000000', str(expr5.day))
        self.assertEqual('864000000', str(expr5.days))
        self.assertEqual('36000000', str(expr5.hour))
        self.assertEqual('36000000', str(expr5.hours))
        self.assertEqual('600000', str(expr5.minute))
        self.assertEqual('600000', str(expr5.minutes))
        self.assertEqual('10000', str(expr5.second))
        self.assertEqual('10000', str(expr5.seconds))
        self.assertEqual('10', str(expr5.milli))
        self.assertEqual('10', str(expr5.millis))

        # hash functions
        self.assertEqual('md5(a)', str(expr1.md5))
        self.assertEqual('sha1(a)', str(expr1.sha1))
        self.assertEqual('sha224(a)', str(expr1.sha224))
        self.assertEqual('sha256(a)', str(expr1.sha256))
        self.assertEqual('sha384(a)', str(expr1.sha384))
        self.assertEqual('sha512(a)', str(expr1.sha512))
        self.assertEqual('sha2(a, 224)', str(expr1.sha2(224)))

        # json functions
        self.assertEqual("IS_JSON('42')", str(lit('42').is_json()))
        self.assertEqual("IS_JSON('42', SCALAR)",
                         str(lit('42').is_json(JsonType.SCALAR)))

        self.assertEqual("JSON_EXISTS('{}', '$.x')",
                         str(lit('{}').json_exists('$.x')))
        self.assertEqual(
            "JSON_EXISTS('{}', '$.x', FALSE)",
            str(lit('{}').json_exists('$.x', JsonExistsOnError.FALSE)))

        self.assertEqual(
            "JSON_VALUE('{}', '$.x', STRING, NULL, null, NULL, null)",
            str(lit('{}').json_value('$.x')))
        self.assertEqual(
            "JSON_VALUE('{}', '$.x', INT, DEFAULT, 42, ERROR, null)",
            str(
                lit('{}').json_value('$.x', DataTypes.INT(),
                                     JsonValueOnEmptyOrError.DEFAULT, 42,
                                     JsonValueOnEmptyOrError.ERROR, None)))

        self.assertEqual(
            "JSON_QUERY('{}', '$.x', WITHOUT_ARRAY, NULL, EMPTY_ARRAY)",
            str(
                lit('{}').json_query('$.x', JsonQueryWrapper.WITHOUT_ARRAY,
                                     JsonQueryOnEmptyOrError.NULL,
                                     JsonQueryOnEmptyOrError.EMPTY_ARRAY)))
Example #16

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(2)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('input')) \
    .with_format(OldCsv()
                 .line_delimiter(' ')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source("inputSource")

t_env.connect(FileSystem().path('output')) \
    .with_format(OldCsv().field_delimiter(',').field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .with_schema(Schema().field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .register_table_sink('sink')

t_env.scan('inputSource').group_by('word').select('word, count(1)').insert_into('sink')

t_env.execute('my first job')
Example #17

def get_result_type(self):
    return DataTypes.STRING()
Example #18

def get_accumulator_type(self):
    return DataTypes.ROW([DataTypes.FIELD("f0", DataTypes.LIST_VIEW(DataTypes.STRING()))])
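
DataTypes.LIST_VIEW declares a ListView-backed accumulator field, which lets Flink keep large intermediate state in the state backend instead of materializing the whole list on every access. A minimal sketch of a matching accumulator, assuming pyflink.table.ListView is available (it ships with general Python UDAF support):

from pyflink.common import Row
from pyflink.table import ListView

def create_accumulator(self):
    # f0 is served from the state backend in streaming group aggregations
    return Row(ListView())
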
Example #19

    def test_sliding_group_window_over_time(self):
        # create source file path
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:30:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
        ]
        source_path = tmp_dir + '/test_sliding_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.create_temporary_system_function("my_sum",
                                                    SumAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c INT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        table_sink = source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd'], [
            DataTypes.TINYINT(),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.BIGINT()
        ])
        self.t_env.register_table_sink("Results", table_sink)
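        # 1-hour windows sliding every 30 minutes on event time, keyed by column a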
        t.window(Slide.over(lit(1).hours)
                 .every(lit(30).minutes)
                 .on(t.rowtime)
                 .alias("w")) \
            .group_by(t.a, col("w")) \
            .select(t.a, col("w").start, col("w").end, call("my_sum", t.c).alias("c")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "+I[1, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2]",
            "+I[2, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 1]",
            "+I[3, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2]",
            "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 5]",
            "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2]",
            "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2]",
            "+I[2, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 1]",
            "+I[1, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 11]",
            "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8]"
        ])
Example #20

def create_another_table_schema():
    return TableSchema(["first2", "second", "third"],
                       [DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING()])
Example #21

def get_result_type(self):
    return DataTypes.BIGINT()
Example #22

    def test_scalar_function(self):
        # test metric disabled.
        self.t_env.get_config().get_configuration().set_string(
            'python.metric.enabled', 'false')
        # test lambda function
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())

        # test Python ScalarFunction
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        # test callable function
        add_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())

        def partial_func(col, param):
            return col + param

        # test partial function
        import functools
        add_one_partial = udf(functools.partial(partial_func, param=1),
                              result_type=DataTypes.BIGINT())

        # check memory limit is set
        @udf(result_type=DataTypes.BIGINT())
        def check_memory_limit(exec_mode):
            if exec_mode == "process":
                assert os.environ['_PYTHON_WORKER_MEMORY_LIMIT'] is not None
            return 1

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd', 'e', 'f', 'g'], [
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT()
            ])
        self.t_env.register_table_sink("Results", table_sink)

        execution_mode = self.t_env.get_config().get_configuration(
        ).get_string("python.execution-mode", "process")

        t = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)],
                                     ['a', 'b', 'c'])
        t.where(add_one(t.b) <= 3).select(
            add_one(t.a), subtract_one(t.b), add(t.a, t.c), add_one_callable(t.a),
            add_one_partial(t.a), check_memory_limit(execution_mode), t.a) \
            .execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(
            actual, ["+I[2, 1, 4, 2, 2, 1, 1]", "+I[4, 0, 12, 4, 4, 1, 3]"])
Example #23

    def test_udf_with_constant_params(self):
        def udf_with_constant_params(p, null_param, tinyint_param,
                                     smallint_param, int_param, bigint_param,
                                     decimal_param, float_param, double_param,
                                     boolean_param, str_param, date_param,
                                     time_param, timestamp_param):

            from decimal import Decimal
            import datetime

            assert null_param is None, 'null_param is wrong value %s' % null_param

            assert isinstance(tinyint_param, int), 'tinyint_param of wrong type %s !' \
                                                   % type(tinyint_param)
            p += tinyint_param
            assert isinstance(smallint_param, int), 'smallint_param of wrong type %s !' \
                                                    % type(smallint_param)
            p += smallint_param
            assert isinstance(int_param, int), 'int_param of wrong type %s !' \
                                               % type(int_param)
            p += int_param
            assert isinstance(bigint_param, int), 'bigint_param of wrong type %s !' \
                                                  % type(bigint_param)
            p += bigint_param
            assert decimal_param == Decimal('1.05'), \
                'decimal_param is wrong value %s ' % decimal_param

            p += int(decimal_param)

            assert isinstance(float_param, float) and float_equal(float_param, 1.23, 1e-06), \
                'float_param is wrong value %s ' % float_param

            p += int(float_param)
            assert isinstance(double_param, float) and float_equal(double_param, 1.98932, 1e-07), \
                'double_param is wrong value %s ' % double_param

            p += int(double_param)

            assert boolean_param is True, 'boolean_param is wrong value %s' % boolean_param

            assert str_param == 'flink', 'str_param is wrong value %s' % str_param

            assert date_param == datetime.date(year=2014, month=9, day=13), \
                'date_param is wrong value %s' % date_param

            assert time_param == datetime.time(hour=12, minute=0, second=0), \
                'time_param is wrong value %s' % time_param

            assert timestamp_param == datetime.datetime(1999, 9, 10, 5, 20, 10), \
                'timestamp_param is wrong value %s' % timestamp_param

            return p

        self.t_env.register_function(
            "udf_with_constant_params",
            udf(udf_with_constant_params,
                input_types=[
                    DataTypes.BIGINT(),
                    DataTypes.BIGINT(),
                    DataTypes.TINYINT(),
                    DataTypes.SMALLINT(),
                    DataTypes.INT(),
                    DataTypes.BIGINT(),
                    DataTypes.DECIMAL(38, 18),
                    DataTypes.FLOAT(),
                    DataTypes.DOUBLE(),
                    DataTypes.BOOLEAN(),
                    DataTypes.STRING(),
                    DataTypes.DATE(),
                    DataTypes.TIME(),
                    DataTypes.TIMESTAMP(3)
                ],
                result_type=DataTypes.BIGINT()))

        self.t_env.register_function(
            "udf_with_all_constant_params",
            udf(lambda i, j: i + j,
                [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT()))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)],
                                     ['a', 'b', 'c'])
        self.t_env.register_table("test_table", t)
        self.t_env.sql_query("select udf_with_all_constant_params("
                             "cast (1 as BIGINT),"
                             "cast (2 as BIGINT)), "
                             "udf_with_constant_params(a, "
                             "cast (null as BIGINT),"
                             "cast (1 as TINYINT),"
                             "cast (1 as SMALLINT),"
                             "cast (1 as INT),"
                             "cast (1 as BIGINT),"
                             "cast (1.05 as DECIMAL),"
                             "cast (1.23 as FLOAT),"
                             "cast (1.98932 as DOUBLE),"
                             "true,"
                             "'flink',"
                             "cast ('2014-09-13' as DATE),"
                             "cast ('12:00:00' as TIME),"
                             "cast ('1999-9-10 05:20:10' as TIMESTAMP))"
                             " from test_table").insert_into("Results")
        self.t_env.execute("test")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["3,8", "3,9", "3,10"])
Example #24

from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
from pyflink.table import StreamTableEnvironment, DataTypes, EnvironmentSettings
from pyflink.table.udf import udf

provinces = ("Beijing", "Shanghai", "Hangzhou", "Shenzhen", "Jiangxi",
             "Chongqing", "Xizang")


@udf(input_types=[DataTypes.INT()], result_type=DataTypes.STRING())
def province_id_to_name(id):
    return provinces[id]  # provinceId arrives as INT (see the DDL below), so it can index the tuple


def log_processing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=env_settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
            CREATE TABLE payment_msg(
                createTime VARCHAR,
                orderId BIGINT,
                payAmount DOUBLE,
                payPlatform INT,
                provinceId INT
            ) WITH (
              'connector.type' = 'kafka',
              'connector.version' = 'universal',
Example #25

    def test_all_data_types(self):
        def boolean_func(bool_param):
            assert isinstance(bool_param, bool), 'bool_param of wrong type %s !' \
                                                 % type(bool_param)
            return bool_param

        def tinyint_func(tinyint_param):
            assert isinstance(tinyint_param, int), 'tinyint_param of wrong type %s !' \
                                                   % type(tinyint_param)
            return tinyint_param

        def smallint_func(smallint_param):
            assert isinstance(smallint_param, int), 'smallint_param of wrong type %s !' \
                                                    % type(smallint_param)
            assert smallint_param == 32767, 'smallint_param of wrong value %s' % smallint_param
            return smallint_param

        def int_func(int_param):
            assert isinstance(int_param, int), 'int_param of wrong type %s !' \
                                               % type(int_param)
            assert int_param == -2147483648, 'int_param of wrong value %s' % int_param
            return int_param

        def bigint_func(bigint_param):
            assert isinstance(bigint_param, int), 'bigint_param of wrong type %s !' \
                                                  % type(bigint_param)
            return bigint_param

        def bigint_func_none(bigint_param):
            assert bigint_param is None, 'bigint_param %s should be None!' % bigint_param
            return bigint_param

        def float_func(float_param):
            assert isinstance(float_param, float) and float_equal(float_param, 1.23, 1e-6), \
                'float_param is wrong value %s !' % float_param
            return float_param

        def double_func(double_param):
            assert isinstance(double_param, float) and float_equal(double_param, 1.98932, 1e-7), \
                'double_param is wrong value %s !' % double_param
            return double_param

        def bytes_func(bytes_param):
            assert bytes_param == b'flink', \
                'bytes_param is wrong value %s !' % bytes_param
            return bytes_param

        def str_func(str_param):
            assert str_param == 'pyflink', \
                'str_param is wrong value %s !' % str_param
            return str_param

        def date_func(date_param):
            from datetime import date
            assert date_param == date(year=2014, month=9, day=13), \
                'date_param is wrong value %s !' % date_param
            return date_param

        def time_func(time_param):
            from datetime import time
            assert time_param == time(hour=12, minute=0, second=0, microsecond=123000), \
                'time_param is wrong value %s !' % time_param
            return time_param

        def timestamp_func(timestamp_param):
            from datetime import datetime
            assert timestamp_param == datetime(2018, 3, 11, 3, 0, 0, 123000), \
                'timestamp_param is wrong value %s !' % timestamp_param
            return timestamp_param

        def array_func(array_param):
            assert array_param == [[1, 2, 3]], \
                'array_param is wrong value %s !' % array_param
            return array_param[0]

        def map_func(map_param):
            assert map_param == {1: 'flink', 2: 'pyflink'}, \
                'map_param is wrong value %s !' % map_param
            return map_param

        def decimal_func(decimal_param):
            from decimal import Decimal
            assert decimal_param == Decimal('1000000000000000000.050000000000000000'), \
                'decimal_param is wrong value %s !' % decimal_param
            return decimal_param

        def decimal_cut_func(decimal_param):
            from decimal import Decimal
            assert decimal_param == Decimal('1000000000000000000.059999999999999999'), \
                'decimal_param is wrong value %s !' % decimal_param
            return decimal_param

        self.t_env.register_function(
            "boolean_func",
            udf(boolean_func, [DataTypes.BOOLEAN()], DataTypes.BOOLEAN()))

        self.t_env.register_function(
            "tinyint_func",
            udf(tinyint_func, [DataTypes.TINYINT()], DataTypes.TINYINT()))

        self.t_env.register_function(
            "smallint_func",
            udf(smallint_func, [DataTypes.SMALLINT()], DataTypes.SMALLINT()))

        self.t_env.register_function(
            "int_func", udf(int_func, [DataTypes.INT()], DataTypes.INT()))

        self.t_env.register_function(
            "bigint_func",
            udf(bigint_func, [DataTypes.BIGINT()], DataTypes.BIGINT()))

        self.t_env.register_function(
            "bigint_func_none",
            udf(bigint_func_none, [DataTypes.BIGINT()], DataTypes.BIGINT()))

        self.t_env.register_function(
            "float_func",
            udf(float_func, [DataTypes.FLOAT()], DataTypes.FLOAT()))

        self.t_env.register_function(
            "double_func",
            udf(double_func, [DataTypes.DOUBLE()], DataTypes.DOUBLE()))

        self.t_env.register_function(
            "bytes_func",
            udf(bytes_func, [DataTypes.BYTES()], DataTypes.BYTES()))

        self.t_env.register_function(
            "str_func", udf(str_func, [DataTypes.STRING()],
                            DataTypes.STRING()))

        self.t_env.register_function(
            "date_func", udf(date_func, [DataTypes.DATE()], DataTypes.DATE()))

        self.t_env.register_function(
            "time_func", udf(time_func, [DataTypes.TIME()], DataTypes.TIME()))

        self.t_env.register_function(
            "timestamp_func",
            udf(timestamp_func, [DataTypes.TIMESTAMP(3)],
                DataTypes.TIMESTAMP(3)))

        self.t_env.register_function(
            "array_func",
            udf(array_func,
                [DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT()))],
                DataTypes.ARRAY(DataTypes.BIGINT())))

        self.t_env.register_function(
            "map_func",
            udf(map_func,
                [DataTypes.MAP(DataTypes.BIGINT(), DataTypes.STRING())],
                DataTypes.MAP(DataTypes.BIGINT(), DataTypes.STRING())))

        self.t_env.register_function(
            "decimal_func",
            udf(decimal_func, [DataTypes.DECIMAL(38, 18)],
                DataTypes.DECIMAL(38, 18)))

        self.t_env.register_function(
            "decimal_cut_func",
            udf(decimal_cut_func, [DataTypes.DECIMAL(38, 18)],
                DataTypes.DECIMAL(38, 18)))

        table_sink = source_sink_utils.TestAppendSink([
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q'
        ], [
            DataTypes.BIGINT(),
            DataTypes.BIGINT(),
            DataTypes.TINYINT(),
            DataTypes.BOOLEAN(),
            DataTypes.SMALLINT(),
            DataTypes.INT(),
            DataTypes.FLOAT(),
            DataTypes.DOUBLE(),
            DataTypes.BYTES(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(3),
            DataTypes.ARRAY(DataTypes.BIGINT()),
            DataTypes.MAP(DataTypes.BIGINT(), DataTypes.STRING()),
            DataTypes.DECIMAL(38, 18),
            DataTypes.DECIMAL(38, 18)
        ])
        self.t_env.register_table_sink("Results", table_sink)

        import datetime
        import decimal
        t = self.t_env.from_elements(
            [(1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
              bytearray(b'flink'), 'pyflink', datetime.date(2014, 9, 13),
              datetime.time(hour=12, minute=0, second=0, microsecond=123000),
              datetime.datetime(2018, 3, 11, 3, 0, 0, 123000), [[1, 2, 3]], {
                  1: 'flink',
                  2: 'pyflink'
              }, decimal.Decimal('1000000000000000000.05'),
              decimal.Decimal(
                  '1000000000000000000.05999999999999999899999999999'))],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.BIGINT()),
                DataTypes.FIELD("c", DataTypes.TINYINT()),
                DataTypes.FIELD("d", DataTypes.BOOLEAN()),
                DataTypes.FIELD("e", DataTypes.SMALLINT()),
                DataTypes.FIELD("f", DataTypes.INT()),
                DataTypes.FIELD("g", DataTypes.FLOAT()),
                DataTypes.FIELD("h", DataTypes.DOUBLE()),
                DataTypes.FIELD("i", DataTypes.BYTES()),
                DataTypes.FIELD("j", DataTypes.STRING()),
                DataTypes.FIELD("k", DataTypes.DATE()),
                DataTypes.FIELD("l", DataTypes.TIME()),
                DataTypes.FIELD("m", DataTypes.TIMESTAMP(3)),
                DataTypes.FIELD(
                    "n", DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT()))),
                DataTypes.FIELD(
                    "o", DataTypes.MAP(DataTypes.BIGINT(),
                                       DataTypes.STRING())),
                DataTypes.FIELD("p", DataTypes.DECIMAL(38, 18)),
                DataTypes.FIELD("q", DataTypes.DECIMAL(38, 18))
            ]))

        t.select("bigint_func(a), bigint_func_none(b),"
                 "tinyint_func(c), boolean_func(d),"
                 "smallint_func(e),int_func(f),"
                 "float_func(g),double_func(h),"
                 "bytes_func(i),str_func(j),"
                 "date_func(k),time_func(l),"
                 "timestamp_func(m),array_func(n),"
                 "map_func(o),decimal_func(p),"
                 "decimal_cut_func(q)") \
            .insert_into("Results")
        self.t_env.execute("test")
        actual = source_sink_utils.results()
        # Currently the sink only supports a precision of 0 for
        # DataTypes.TIME(precision), so the expected value below is
        # "12:00:00" rather than "12:00:00.123".
        self.assert_equals(actual, [
            "1,null,1,true,32767,-2147483648,1.23,1.98932,"
            "[102, 108, 105, 110, 107],pyflink,2014-09-13,"
            "12:00:00,2018-03-11 03:00:00.123,[1, 2, 3],"
            "{1=flink, 2=pyflink},1000000000000000000.050000000000000000,"
            "1000000000000000000.059999999999999999"
        ])
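The decimal_cut_func case relies on the engine rounding the oversized literal to the declared scale of DECIMAL(38, 18) before the UDF runs. A minimal plain-Python sketch of that rounding (half-up rounding reproduces the expected string here, but the exact internal rounding mode is an assumption):

from decimal import Decimal, ROUND_HALF_UP, getcontext

getcontext().prec = 38  # match the precision of DECIMAL(38, 18)
value = Decimal('1000000000000000000.05999999999999999899999999999')
scaled = value.quantize(Decimal(1).scaleb(-18), rounding=ROUND_HALF_UP)
print(scaled)  # 1000000000000000000.059999999999999999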
Example No. 26
import os

from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
from pyflink.table import StreamTableEnvironment, DataTypes, CsvTableSink
from pyflink.table.descriptors import (CustomConnectorDescriptor,
                                       CustomFormatDescriptor, Schema)
from pyflink.table.window import Tumble


def custom_kafka_source_demo():
    custom_connector = CustomConnectorDescriptor('kafka', 1, True) \
        .property('connector.topic', 'user') \
        .property('connector.properties.0.key', 'zookeeper.connect') \
        .property('connector.properties.0.value', 'localhost:2181') \
        .property('connector.properties.1.key', 'bootstrap.servers') \
        .property('connector.properties.1.value', 'localhost:9092') \
        .properties({'connector.version': '0.11', 'connector.startup-mode': 'earliest-offset'})

    # the JSON schema is passed under the property key 'format.json-schema'
    custom_format = CustomFormatDescriptor('json', 1) \
        .property('format.json-schema',
                  "{"
                  "  type: 'object',"
                  "  properties: {"
                  "    a: {"
                  "      type: 'string'"
                  "    },"
                  "    b: {"
                  "      type: 'string'"
                  "    },"
                  "    c: {"
                  "      type: 'string'"
                  "    },"
                  "    time: {"
                  "      type: 'string',"
                  "      format: 'date-time'"
                  "    }"
                  "  }"
                  "}") \
        .properties({'format.fail-on-missing-field': 'true'})

    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/custom_kafka_source_demo.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(custom_connector) \
        .with_format(
            custom_format
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("proctime", DataTypes.TIMESTAMP())
            .proctime()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.STRING()], result_file))

    st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("custom kafka source demo")
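For reference, records on the 'user' topic must match the JSON schema declared above; a hypothetical example message:

{"a": "user_1", "b": "click", "c": "mobile", "time": "2020-01-01T12:00:00Z"}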
Example No. 27
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.udf import udf

# https://flink.apache.org/2020/04/09/pyflink-udf-support-flink.html
# https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/python/table-api-users-guide/udfs/python_udfs.html

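# With func_type="pandas", the UDF receives its arguments as pandas.Series and is
# invoked once per batch, so the addition below is vectorized rather than per-row.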
@udf(input_types=[DataTypes.INT(), DataTypes.INT()],
     result_type=DataTypes.BIGINT(),
     func_type="pandas")
def add(i, j):
    return i + j

env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(env)

t_env.create_temporary_function("add", add)

my_source_ddl = """
create table mySource (
    a INT,
    b INT
) with (
    'connector' = 'datagen',
    'rows-per-second' = '5'
)
"""

my_sink_ddl = """
create table mySink (
    c BIGINT
) with (
    'connector' = 'print'
)
"""
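The original snippet is cut off here; a plausible completion that submits both DDL statements and wires the source to the sink (the use of execute_sql/execute_insert is an assumption consistent with the Flink 1.12 API, not part of the original):

t_env.execute_sql(my_source_ddl)
t_env.execute_sql(my_sink_ddl)

# apply the pandas UDF to each batch of rows and write to the print sink
t_env.from_path('mySource') \
    .select('add(a, b)') \
    .execute_insert('mySink').wait()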
Example No. 28
    def test_expressions(self):
        expr1 = col('a')
        expr2 = col('b')
        expr3 = col('c')

        self.assertEqual('10', str(lit(10, DataTypes.INT(False))))
        self.assertEqual('rangeTo(1, 2)', str(range_(1, 2)))
        self.assertEqual('and(a, b, c)', str(and_(expr1, expr2, expr3)))
        self.assertEqual('or(a, b, c)', str(or_(expr1, expr2, expr3)))

        from pyflink.table.expressions import UNBOUNDED_ROW, UNBOUNDED_RANGE, CURRENT_ROW, \
            CURRENT_RANGE
        self.assertEqual('unboundedRow()', str(UNBOUNDED_ROW))
        self.assertEqual('unboundedRange()', str(UNBOUNDED_RANGE))
        self.assertEqual('currentRow()', str(CURRENT_ROW))
        self.assertEqual('currentRange()', str(CURRENT_RANGE))

        self.assertEqual('currentDate()', str(current_date()))
        self.assertEqual('currentTime()', str(current_time()))
        self.assertEqual('currentTimestamp()', str(current_timestamp()))
        self.assertEqual('localTime()', str(local_time()))
        self.assertEqual('localTimestamp()', str(local_timestamp()))
        self.assertEqual('toTimestampLtz(123, 0)',
                         str(to_timestamp_ltz(123, 0)))
        self.assertEqual(
            "temporalOverlaps(cast('2:55:00', TIME(0)), 3600000, "
            "cast('3:30:00', TIME(0)), 7200000)",
            str(
                temporal_overlaps(
                    lit("2:55:00").to_time,
                    lit(1).hours,
                    lit("3:30:00").to_time,
                    lit(2).hours)))
        self.assertEqual("dateFormat(time, '%Y, %d %M')",
                         str(date_format(col("time"), "%Y, %d %M")))
        self.assertEqual(
            "timestampDiff(DAY, cast('2016-06-15', DATE), cast('2016-06-18', DATE))",
            str(
                timestamp_diff(TimePointUnit.DAY,
                               lit("2016-06-15").to_date,
                               lit("2016-06-18").to_date)))
        self.assertEqual('array(1, 2, 3)', str(array(1, 2, 3)))
        self.assertEqual("row('key1', 1)", str(row("key1", 1)))
        self.assertEqual("map('key1', 1, 'key2', 2, 'key3', 3)",
                         str(map_("key1", 1, "key2", 2, "key3", 3)))
        self.assertEqual('4', str(row_interval(4)))
        self.assertEqual('pi()', str(pi()))
        self.assertEqual('e()', str(e()))
        self.assertEqual('rand(4)', str(rand(4)))
        self.assertEqual('randInteger(4)', str(rand_integer(4)))
        self.assertEqual('atan2(1, 2)', str(atan2(1, 2)))
        self.assertEqual('minusPrefix(a)', str(negative(expr1)))
        self.assertEqual('concat(a, b, c)', str(concat(expr1, expr2, expr3)))
        self.assertEqual("concat_ws(', ', b, c)",
                         str(concat_ws(', ', expr2, expr3)))
        self.assertEqual('uuid()', str(uuid()))
        self.assertEqual('null', str(null_of(DataTypes.BIGINT())))
        self.assertEqual('log(a)', str(log(expr1)))
        self.assertEqual('ifThenElse(a, b, c)',
                         str(if_then_else(expr1, expr2, expr3)))
        self.assertEqual('withColumns(a, b, c)',
                         str(with_columns(expr1, expr2, expr3)))
        self.assertEqual('a.b.c(a)', str(call('a.b.c', expr1)))
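As a usage sketch, these expression builders compose directly inside Table.select; the table `tab` and its columns are hypothetical here:

from pyflink.table.expressions import col, concat_ws, if_then_else, lit

result = tab.select(
    if_then_else(col('a') > lit(0), lit('positive'), lit('non-positive')).alias('sign'),
    concat_ws(', ', col('b'), col('c')).alias('bc'))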
Example No. 29
from pyflink.table import DataTypes
from pyflink.table.descriptors import Kafka, Json, Schema


def register_rides_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("rideTime", DataTypes.TIMESTAMP())
        ]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())) \
        .in_append_mode() \
        .register_table_sink("sink")
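Once registered, a query whose result matches this schema can write to the sink; a minimal sketch (a configured StreamTableEnvironment st_env, the companion register_rides_source function, and the job name are assumed):

register_rides_source(st_env)
register_rides_sink(st_env)
st_env.scan("source").insert_into("sink")
st_env.execute("rides pipeline")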
Example No. 30
# https://ci.apache.org/projects/flink/flink-docs-master/getting-started/walkthroughs/python_table_api.html

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('/tmp/input')) \
    .with_format(OldCsv()
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/tmp/output')) \
    .with_format(OldCsv()
                 .field_delimiter('\t')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

t_env.from_path('mySource') \
    .group_by('word') \
    .select('word, count(1)') \