Example #1
    def test_from_data_stream(self):
        self.env.set_parallelism(1)

        ds = self.env.from_collection([(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
                                      type_info=Types.ROW([Types.INT(),
                                                           Types.STRING(),
                                                           Types.STRING()]))
        t_env = self.t_env
        table = t_env.from_data_stream(ds)
        field_names = ['a', 'b', 'c']
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        t_env.register_table_sink("Sink",
                                  source_sink_utils.TestAppendSink(field_names, field_types))
        t_env.insert_into("Sink", table)
        t_env.execute("test_from_data_stream")
        result = source_sink_utils.results()
        expected = ['+I[1, Hi, Hello]', '+I[2, Hello, Hi]']
        self.assert_equals(result, expected)

        ds = ds.map(lambda x: x, Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))
        table = t_env.from_data_stream(ds, col('a'), col('b'), col('c'))
        t_env.register_table_sink("ExprSink",
                                  source_sink_utils.TestAppendSink(field_names, field_types))
        t_env.insert_into("ExprSink", table)
        t_env.execute("test_from_data_stream_with_expr")
        result = source_sink_utils.results()
        self.assert_equals(result, expected)
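
The snippets in these examples are excerpts from larger test modules: self.env and self.t_env come from a test-harness base class, and module-level imports are omitted. A minimal sketch of the imports this first example assumes, based on the PyFlink API rather than the original file header:

# assumed imports for the excerpt above (a sketch, not the original header)
from pyflink.common.typeinfo import Types
from pyflink.table import DataTypes
from pyflink.table.expressions import col
from pyflink.testing import source_sink_utils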
Example #2
    def test_sliding_group_window_over_proctime(self):
        self.t_env.get_config().set("parallelism.default", "1")
        from pyflink.table.window import Slide
        self.t_env.register_function("mean_udaf", mean_udaf)

        source_table = """
            create table source_table(
                a INT,
                proctime as PROCTIME()
            ) with(
                'connector' = 'datagen',
                'rows-per-second' = '1',
                'fields.a.kind' = 'sequence',
                'fields.a.start' = '1',
                'fields.a.end' = '10'
            )
        """
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")
        iterator = t.select(t.a, t.proctime) \
            .window(Slide.over(lit(1).seconds)
                    .every(lit(1).seconds)
                    .on(t.proctime)
                    .alias("w")) \
            .group_by(t.a, col("w")) \
            .select(mean_udaf(t.a).alias("b"), col("w").start).execute().collect()
        result = list(iterator)
        # if WindowAssigner.isEventTime() did not return false,
        # w.start would be 1970-01-01
        # TODO: once the time zone problem of processing-time windows is fixed
        # (FLIP-162), replace this with a more accurate assertion.
        self.assertTrue(result[0][1].year > 1970)
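
mean_udaf is registered above but not defined in this excerpt. A minimal sketch of a compatible definition, assuming a pandas UDAF that averages its input column (the original test module may define it differently):

# sketch: a pandas UDAF computing the mean of a numeric column
from pyflink.table import DataTypes
from pyflink.table.udf import udaf

mean_udaf = udaf(lambda v: v.mean(),
                 result_type=DataTypes.FLOAT(),
                 func_type="pandas")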
Example #3
    def test_tumble_group_window_aggregate_function(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))
        sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b TIMESTAMP(3), c FLOAT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        tumble_window = Tumble.over(lit(1).hours) \
            .on(col("rowtime")) \
            .alias("w")
        t.window(tumble_window) \
            .group_by(col("w")) \
            .select(col("w").start, col("w").end, mean_udaf(t.b)) \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[2018-03-11T03:00, 2018-03-11T04:00, 2.2]",
                            "+I[2018-03-11T04:00, 2018-03-11T05:00, 8.0]"])
Example #4
    def test_table_function(self):
        self._register_table_sink(
            ['a', 'b', 'c'],
            [DataTypes.BIGINT(),
             DataTypes.BIGINT(),
             DataTypes.BIGINT()])

        multi_emit = udtf(
            MultiEmit(), result_types=[DataTypes.BIGINT(),
                                       DataTypes.BIGINT()])
        multi_num = udf(MultiNum(), result_type=DataTypes.BIGINT())

        t = self.t_env.from_elements([(1, 1, 3), (2, 1, 6), (3, 2, 9)],
                                     ['a', 'b', 'c'])
        t = t.join_lateral(
            multi_emit((t.a + t.a) / 2, multi_num(t.b)).alias('x', 'y'))
        t = t.left_outer_join_lateral(condition_multi_emit(t.x, t.y).alias('m')) \
            .select(t.x, t.y, col("m"))
        t = t.left_outer_join_lateral(identity(t.m).alias('n')) \
            .select(t.x, t.y, col("n"))
        actual = self._get_output(t)
        self.assert_equals(actual, [
            "+I[1, 0, null]", "+I[1, 1, null]", "+I[2, 0, null]",
            "+I[2, 1, null]", "+I[3, 0, 0]", "+I[3, 0, 1]", "+I[3, 0, 2]",
            "+I[3, 1, 1]", "+I[3, 1, 2]", "+I[3, 2, 2]", "+I[3, 3, null]"
        ])
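
MultiEmit, MultiNum, condition_multi_emit and identity are defined elsewhere in the test module. A sketch of the two classes wrapped by udtf/udf here, written to be consistent with the expected output above (an assumption, not the original definitions):

# sketch: a table function emitting (x, i) for each i in range(y),
# and a scalar function doubling its input
from pyflink.table.udf import ScalarFunction, TableFunction

class MultiEmit(TableFunction):
    def eval(self, x, y):
        for i in range(y):
            yield x, i

class MultiNum(ScalarFunction):
    def eval(self, x):
        return x * 2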
Example #5
    def test_tumble_window(self):
        t = self.t_env.from_elements([(1, 1, "Hello")], ["a", "b", "c"])
        result = t.window(Tumble.over(expr.row_interval(2)).on("a").alias("w"))\
            .group_by(expr.col('w'), expr.col('c')).select(t.b.sum)

        query_operation = result._j_table.getQueryOperation().getChildren().get(0)
        self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
        self.assertEqual('TumbleWindow(field: [a], size: [2])',
                         query_operation.getGroupWindow().asSummaryString())
Example #6
    def test_session_window(self):
        t = self.t_env.from_elements([(1000, 1, "Hello")], ["a", "b", "c"])
        result = t.window(Session.with_gap(expr.lit(1).seconds).on("a").alias("w"))\
            .group_by(expr.col('w'), expr.col('c')).select(t.b.sum)

        query_operation = result._j_table.getQueryOperation().getChildren().get(0)
        self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
        self.assertEqual('SessionWindow(field: [a], gap: [1000])',
                         query_operation.getGroupWindow().asSummaryString())
Example #7
    def test_session_group_window_over_time(self):
        # create source file path
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_session_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.register_function("my_count", CountAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd'],
            [
                DataTypes.TINYINT(),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t.window(Session.with_gap(lit(30).minutes).on(t.rowtime).alias("w")) \
            .group_by(t.a, t.b, col("w")) \
            .select(t.a, col("w").start, col("w").end, call("my_count", t.c).alias("c")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[3, 2018-03-11 03:10:00.0, 2018-03-11 03:40:00.0, 1]",
                            "+I[2, 2018-03-11 03:10:00.0, 2018-03-11 04:00:00.0, 2]",
                            "+I[1, 2018-03-11 03:10:00.0, 2018-03-11 04:10:00.0, 2]",
                            "+I[1, 2018-03-11 04:20:00.0, 2018-03-11 04:50:00.0, 1]"])
Example #8
    def test_expressions(self):
        expr1 = col('a')
        expr2 = col('b')
        expr3 = col('c')

        self.assertEqual('10', str(lit(10, DataTypes.INT(False))))
        self.assertEqual('rangeTo(1, 2)', str(range_(1, 2)))
        self.assertEqual('and(a, b, c)', str(and_(expr1, expr2, expr3)))
        self.assertEqual('or(a, b, c)', str(or_(expr1, expr2, expr3)))

        from pyflink.table.expressions import UNBOUNDED_ROW, UNBOUNDED_RANGE, CURRENT_ROW, \
            CURRENT_RANGE
        self.assertEqual('unboundedRow()', str(UNBOUNDED_ROW))
        self.assertEqual('unboundedRange()', str(UNBOUNDED_RANGE))
        self.assertEqual('currentRow()', str(CURRENT_ROW))
        self.assertEqual('currentRange()', str(CURRENT_RANGE))

        self.assertEqual('currentDate()', str(current_date()))
        self.assertEqual('currentTime()', str(current_time()))
        self.assertEqual('currentTimestamp()', str(current_timestamp()))
        self.assertEqual('localTime()', str(local_time()))
        self.assertEqual('localTimestamp()', str(local_timestamp()))
        self.assertEqual('toTimestampLtz(123, 0)', str(to_timestamp_ltz(123, 0)))
        self.assertEqual("temporalOverlaps(cast('2:55:00', TIME(0)), 3600000, "
                         "cast('3:30:00', TIME(0)), 7200000)",
                         str(temporal_overlaps(
                             lit("2:55:00").to_time,
                             lit(1).hours,
                             lit("3:30:00").to_time,
                             lit(2).hours)))
        self.assertEqual("dateFormat(time, '%Y, %d %M')",
                         str(date_format(col("time"), "%Y, %d %M")))
        self.assertEqual("timestampDiff(DAY, cast('2016-06-15', DATE), cast('2016-06-18', DATE))",
                         str(timestamp_diff(
                             TimePointUnit.DAY,
                             lit("2016-06-15").to_date,
                             lit("2016-06-18").to_date)))
        self.assertEqual('array(1, 2, 3)', str(array(1, 2, 3)))
        self.assertEqual("row('key1', 1)", str(row("key1", 1)))
        self.assertEqual("map('key1', 1, 'key2', 2, 'key3', 3)",
                         str(map_("key1", 1, "key2", 2, "key3", 3)))
        self.assertEqual('4', str(row_interval(4)))
        self.assertEqual('pi()', str(pi()))
        self.assertEqual('e()', str(e()))
        self.assertEqual('rand(4)', str(rand(4)))
        self.assertEqual('randInteger(4)', str(rand_integer(4)))
        self.assertEqual('atan2(1, 2)', str(atan2(1, 2)))
        self.assertEqual('minusPrefix(a)', str(negative(expr1)))
        self.assertEqual('concat(a, b, c)', str(concat(expr1, expr2, expr3)))
        self.assertEqual("concat_ws(', ', b, c)", str(concat_ws(', ', expr2, expr3)))
        self.assertEqual('uuid()', str(uuid()))
        self.assertEqual('null', str(null_of(DataTypes.BIGINT())))
        self.assertEqual('log(a)', str(log(expr1)))
        self.assertEqual('ifThenElse(a, b, c)', str(if_then_else(expr1, expr2, expr3)))
        self.assertEqual('withColumns(a, b, c)', str(with_columns(expr1, expr2, expr3)))
        self.assertEqual('a.b.c(a)', str(call('a.b.c', expr1)))
Example #9
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())

    # used to test pipeline.jars and pipeline.classpaths
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().get_configuration().set_string(config_key, config_value)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    t_env.execute_sql("create temporary system function add_one as 'add_one.add_one' language python")
    t_env.register_java_function("add_one_java", "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t = t_env.from_elements(elements, ["word", "count"])
    t.select(t.word,
             call("add_one", t.count).alias("count"),
             call("add_one_java", t.count).alias("count_java")) \
        .group_by(t.word) \
        .select(t.word,
                col("count").count.alias("count"),
                col("count_java").count.alias("count_java")) \
        .execute_insert("Results")
Example #10
def demo01():
    # environment configuration
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
            .in_batch_mode().use_blink_planner().build())

    # register Orders table and Result table sink in table environment
    source_data_path = "/path/to/source/directory/"
    result_data_path = "/path/to/result/directory/"
    source_ddl = f"""
            create table Orders(
                a VARCHAR,
                b BIGINT,
                c BIGINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND
            ) with (
                'connector' = 'filesystem',
                'format' = 'csv',
                'path' = '{source_data_path}'
            )
            """
    t_env.execute_sql(source_ddl)

    sink_ddl = f"""
        create table `Result`(
            a VARCHAR,
            cnt BIGINT
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '{result_data_path}'
        )
        """
    t_env.execute_sql(sink_ddl)

    # specify table program
    orders = t_env.from_path("Orders")  # schema (a, b, c, rowtime)

    orders.group_by("a").select(orders.a, orders.b.count.alias('cnt')).execute_insert("result").wait()

    # the calls below only illustrate the Table API; their results are discarded
    orders.where(orders.a == 'red')
    orders.filter(orders.b % 2 == 0)
    orders.add_columns(concat(orders.c, 'sunny'))
    orders.add_or_replace_columns(concat(orders.c, 'sunny').alias('desc'))
    orders.drop_columns(orders.b, orders.c)
    orders.rename_columns(orders.b.alias('b2'), orders.c.alias('c2'))
    orders.group_by(orders.a).select(orders.a, orders.b.sum.alias('d'))

    # tab.group_by(tab.key).select(tab.key, tab.value.avg.alias('average'))
    # tab.group_by("key").select("key, value.avg as average")
    result = orders.filter(orders.a.is_not_null & orders.b.is_not_null & orders.c.is_not_null) \
        .select(orders.a.lower_case.alias('a'), orders.b, orders.rowtime) \
        .window(Tumble.over(lit(1).hour).on(orders.rowtime).alias("hourly_window")) \
        .group_by(col('hourly_window'), col('a')) \
        .select(col('a'), col('hourly_window').end.alias('hour'), col('b').avg.alias('avg_billing_amount'))
    """
Example #11
    def test_over_window(self):
        t_env = self.t_env
        t = t_env.from_elements([(1, 1, "Hello")], ['a', 'b', 'c'])

        result = t.over_window(
            Over.partition_by(t.c).order_by(t.a).preceding(
                expr.row_interval(2)).following(expr.CURRENT_ROW).alias("w"))

        self.assertRaisesRegex(Py4JJavaError,
                               "Ordering must be defined on a time attribute",
                               result.select,
                               expr.col("b").sum.over(expr.col("w")))
Example #12
def word_count(input_path, output_path):
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    # write all the data to one file
    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    # define the source
    if input_path is not None:
        t_env.create_temporary_table(
            'source',
            TableDescriptor.for_connector('filesystem')
                .schema(Schema.new_builder()
                        .column('word', DataTypes.STRING())
                        .build())
                .option('path', input_path)
                .format('csv')
                .build())
        tab = t_env.from_path('source')
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        tab = t_env.from_elements(
            map(lambda i: (i, ), word_count_data),
            DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())]))

    # define the sink
    if output_path is not None:
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('filesystem')
                .schema(Schema.new_builder()
                        .column('word', DataTypes.STRING())
                        .column('count', DataTypes.BIGINT())
                        .build())
                .option('path', output_path)
                .format(FormatDescriptor.for_format('canal-json').build())
                .build())
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('print')
                .schema(Schema.new_builder()
                        .column('word', DataTypes.STRING())
                        .column('count', DataTypes.BIGINT())
                        .build())
                .build())

    @udtf(result_types=[DataTypes.STRING()])
    def split(line: Row):
        for s in line[0].split():
            yield Row(s)

    # compute word count
    tab.flat_map(split).alias('word') \
       .group_by(col('word')) \
       .select(col('word'), lit(1).count) \
       .execute_insert('sink') \
       .wait()
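
word_count_data is assumed to be a module-level list of sample lines, and the function is presumably driven by a small command-line entry point. A sketch of both (everything here beyond word_count itself is an assumption):

# sketch of the surrounding module pieces this example relies on
word_count_data = ["To be, or not to be, that is the question"]  # placeholder sample line

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False, help='input file path')
    parser.add_argument('--output', dest='output', required=False, help='output file path')
    args = parser.parse_args()
    word_count(args.input, args.output)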
Example #13
    def test_aggregate_with_pandas_udaf(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD("b", DataTypes.SMALLINT()),
                DataTypes.FIELD("c", DataTypes.INT())
            ]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.TINYINT(),
             DataTypes.FLOAT(),
             DataTypes.INT()])
        self.t_env.register_table_sink("Results", table_sink)
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.a.max()),
                           result_type=DataTypes.ROW([
                               DataTypes.FIELD("a", DataTypes.FLOAT()),
                               DataTypes.FIELD("b", DataTypes.INT())
                           ]),
                           func_type="pandas")
        t.select(t.a, t.b) \
            .group_by(t.a) \
            .aggregate(pandas_udaf) \
            .select(expr.col("*")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[1, 5.0, 1]", "+I[2, 2.0, 2]"])
Example #14
    def test_aggregate_with_pandas_udaf(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b FLOAT, c INT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.a.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        t.select(t.a, t.b) \
            .group_by(t.a) \
            .aggregate(pandas_udaf) \
            .select(expr.col("*")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[1, 5.0, 1]", "+I[2, 2.0, 2]"])
Example #15
    def test_window_aggregate_with_pandas_udaf(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [(1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
             (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
             (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
             (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
             (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
             (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD("b", DataTypes.SMALLINT()),
                DataTypes.FIELD("c", DataTypes.INT()),
                DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))
            ]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.TIMESTAMP(3),
             DataTypes.FLOAT(),
             DataTypes.INT()])
        self.t_env.register_table_sink("Results", table_sink)
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW([
                               DataTypes.FIELD("a", DataTypes.FLOAT()),
                               DataTypes.FIELD("b", DataTypes.INT())
                           ]),
                           func_type="pandas")
        tumble_window = Tumble.over(expr.lit(1).hours) \
            .on(expr.col("rowtime")) \
            .alias("w")
        t.select(t.b, t.rowtime) \
            .window(tumble_window) \
            .group_by(expr.col("w")) \
            .aggregate(pandas_udaf.alias("d", "e")) \
            .select(expr.col("w").rowtime, expr.col("d"), expr.col("e")) \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "+I[2018-03-11 03:59:59.999, 2.2, 3]",
            "+I[2018-03-11 04:59:59.999, 8.0, 8]"
        ])
Example #16
def _get_or_create_java_expression(expr: Union["Expression", str]):
    if isinstance(expr, Expression):
        return expr._j_expr
    elif isinstance(expr, str):
        from pyflink.table.expressions import col
        return col(expr)._j_expr
    else:
        raise TypeError(
            "Invalid argument: expected Expression or string, got {0}.".format(type(expr)))
Example #17
    def test_window_aggregate_with_pandas_udaf(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b FLOAT, c INT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        print(t.get_schema())
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        tumble_window = Tumble.over(expr.lit(1).hours) \
            .on(expr.col("rowtime")) \
            .alias("w")
        t.select(t.b, t.rowtime) \
            .window(tumble_window) \
            .group_by(expr.col("w")) \
            .aggregate(pandas_udaf.alias("d", "e")) \
            .select(expr.col("w").rowtime, expr.col("d"), expr.col("e")) \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[2018-03-11 03:59:59.999, 2.2, 3]",
                            "+I[2018-03-11 04:59:59.999, 8.0, 8]"])
Example #18
def mixing_use_of_datastream_and_table():
    # use StreamTableEnvironment instead of TableEnvironment when mixing use of table & datastream
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source
    t_env.create_temporary_table(
        'source',
        TableDescriptor.for_connector('datagen')
            .schema(Schema.new_builder()
                    .column('id', DataTypes.BIGINT())
                    .column('data', DataTypes.STRING())
                    .build())
            .option("number-of-rows", "10")
            .build())

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
            .schema(Schema.new_builder()
                    .column('a', DataTypes.BIGINT())
                    .build())
            .build())

    @udf(result_type=DataTypes.BIGINT())
    def length(data):
        return len(data)

    # perform table api operations
    table = t_env.from_path("source")
    table = table.select(col('id'), length(col('data')))

    # convert table to datastream and perform datastream api operations
    ds = t_env.to_data_stream(table)
    ds = ds.map(lambda i: i[0] + i[1], output_type=Types.LONG())

    # convert datastream to table and perform table api operations as you want
    table = t_env.from_data_stream(
        ds,
        Schema.new_builder().column("f0", DataTypes.BIGINT()).build())

    # execute
    table.execute_insert('sink') \
         .wait()