Example #1
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
         .select(table.word, expr.lit(1).count.alias('count')) \
         .execute_insert("Results") \
         .wait()
Example #2
def pandas_udf_demo():
    # Init environment
    environment_settings = EnvironmentSettings.new_instance(
    ).use_blink_planner().in_batch_mode().build()
    t_env = TableEnvironment.create(environment_settings=environment_settings)
    t_env.get_config().get_configuration().set_string('parallelism.default',
                                                      '1')

    # Define and register UDF
    add = udf(lambda i, j: i + j,
              [DataTypes.BIGINT(), DataTypes.BIGINT()],
              DataTypes.BIGINT(),
              udf_type="pandas")

    # Register Source
    t_env.execute_sql("""
            CREATE TABLE mySource (
              a BIGINT,
              b BIGINT
            ) WITH (
              'connector' = 'filesystem',
              'format' = 'csv',
              'path' = '/opt/examples/table/input/udf_add_input'
            )
        """)

    # Register Sink
    t_env.execute_sql("""
            CREATE TABLE mySink (
              `sum` BIGINT
            ) WITH (
              'connector' = 'filesystem',
              'format' = 'csv',
              'path' = '/opt/examples/table/output/pandas_udf_add_output'
            )
        """)

    # Query
    t_env.from_path('mySource')\
        .select(add(col('a'), col('b'))) \
        .insert_into('mySink')

    t_env.execute("5-pandas_udf_add")
Example #3
    def test_with_built_in_database_name(self):

        gateway = get_gateway()

        DEFAULT_BUILTIN_DATABASE = gateway.jvm.EnvironmentSettings.DEFAULT_BUILTIN_DATABASE

        builder = EnvironmentSettings.new_instance()

        # test the default behaviour to make sure it is consistent with the python doc
        environment_settings = builder.build()

        self.assertEqual(environment_settings.get_built_in_database_name(),
                         DEFAULT_BUILTIN_DATABASE)

        environment_settings = builder.with_built_in_database_name(
            "my_database").build()

        self.assertEqual(environment_settings.get_built_in_database_name(),
                         "my_database")
Example #4
 def test_to_append_stream(self):
     self.env.set_parallelism(1)
     t_env = StreamTableEnvironment.create(
         self.env,
         environment_settings=EnvironmentSettings.in_streaming_mode())
     table = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hi")],
                                 ["a", "b", "c"])
     new_table = table.select(table.a + 1, table.b + 'flink', table.c)
     ds = t_env.to_append_stream(
         table=new_table,
         type_info=Types.ROW([Types.LONG(),
                              Types.STRING(),
                              Types.STRING()]))
     test_sink = DataStreamTestSinkFunction()
     ds.add_sink(test_sink)
     self.env.execute("test_to_append_stream")
     result = test_sink.get_results(False)
     expected = ['+I[2, Hiflink, Hello]', '+I[3, Helloflink, Hi]']
     self.assertEqual(result, expected)
Example #5
def load(token, day):
    # fetch the stock basic-info dimension data
    pro = ts.pro_api(token)
    df = pro.query('stock_basic', ts_code="000001.SZ", list_status='L',
                   fields='ts_code,symbol,name,area,industry,market,curr_type,list_date,is_hs')

    # create the entry point of the Flink program
    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # convert the pandas DataFrame into a Table and register it as a temporary view under an alias
    table = table_env.from_pandas(df)
    table_env.create_temporary_view("stock_info", table)
    # declare the output sink
    sink_ddl = """
    -- register the MySQL table 'dim_stock' in Flink SQL
    create table Results(
        ts_code STRING,
        symbol STRING,
        name  STRING,
        area   STRING,
        industry  STRING,
        market    STRING,
        curr_type STRING,
        list_date  STRING,
        is_hs  STRING
    ) with (
       'connector' = 'jdbc',
       'url' = 'jdbc:mysql://localhost:3306/shares',
       'table-name' = 'dim_stock',
       'username' = 'root',
       'password' = '123456'
    )
    """
    table_env.execute_sql(sink_ddl)

    # the JDBC connector needs extra Java JARs on the classpath
    table_env.get_config().get_configuration().set_string("pipeline.jars",
                                                          "file:///home/wy/shares/mysql-connector-java-5.1.49.jar;file:///home/wy/shares/flink-connector-jdbc_2.12-1.12.2.jar")

    # when running on a mini cluster, call wait() to block until the job finishes
    table_env.execute_sql("insert into Results select * from stock_info").wait()
Example #6
def main(args):
    func = args[1]
    version = args[2]
    index_name = '_'.join(["performance_pyflink", func, version])
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)

    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 300000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    # t_env.register_table_sink(
    #     "sink",
    #     PrintTableSink(
    #         ["id"],
    #         [DataTypes.INT(False)]))

    @udf(input_types=[DataTypes.INT(False)], result_type=DataTypes.INT(False))
    def inc(x):
        return x + 1

    t_env.register_function("inc", inc)
    t_env.register_java_function("java_inc", "com.alibaba.flink.function.JavaInc")
    register_source(t_env)
    register_sink(t_env, index_name)

    source = t_env.from_path("source")

    if func == 'java':
        table = source.select("java_inc(a) as a")
    else:
        table = source.select("inc(a) as a")

    table.filter("a % 1000000 = 0") \
        .insert_into("sink")

    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink Python UDF inc() consume time: " + str(time.time() - beg_time))
Example #7
 def test_to_append_stream(self):
     self.env.set_parallelism(1)
     t_env = StreamTableEnvironment.create(
         self.env,
         environment_settings=EnvironmentSettings.new_instance(
         ).use_blink_planner().build())
     table = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hi")],
                                 ["a", "b", "c"])
     new_table = table.select("a + 1, b + 'flink', c")
     ds = t_env.to_append_stream(
         table=new_table,
         type_info=Types.ROW([Types.LONG(),
                              Types.STRING(),
                              Types.STRING()]))
     test_sink = DataStreamTestSinkFunction()
     ds.add_sink(test_sink)
     self.env.execute("test_to_append_stream")
     result = test_sink.get_results(False)
     expected = ['2,Hiflink,Hello', '3,Helloflink,Hi']
     self.assertEqual(result, expected)
Example #8
def main():
    table_env = StreamTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())

    props = get_application_properties()
    input_property_map = property_map(props, "consumer.config.0")
    output_property_map = property_map(props, "sink.config.0")

    input_table_name = "input_table"
    output_table_name = "output_table"

    table_env.execute_sql("""CREATE TABLE {0} (
                foo BIGINT
            ) WITH (
                'connector' = 'kinesis',
                'stream' = '{1}',
                'aws.region' = '{2}',
                'scan.stream.initpos' = '{3}',
                'format' = 'json',
                'json.timestamp-format.standard' = 'ISO-8601'
            )""".format(input_table_name,
                        input_property_map["input.stream.name"],
                        input_property_map["aws.region"],
                        input_property_map["flink.stream.initpos"]))

    table_env.execute_sql("""CREATE TABLE {0} (
                bar BIGINT
            ) WITH (
                'connector' = 'filesystem',
                'path' = 's3a://{1}/',
                'format' = 'csv',
                'sink.partition-commit.policy.kind' = 'success-file',
                'sink.partition-commit.delay' = '1 min'
            )""".format(output_table_name,
                        output_property_map["output.bucket.name"]))

    table_env.execute_sql("""INSERT INTO {0}
            SELECT foo as bar
            FROM {1}
        """.format(output_table_name, input_table_name))
Example #9
 def test_to_retract_stream(self):
     self.env.set_parallelism(1)
     t_env = StreamTableEnvironment.create(
         self.env,
         environment_settings=EnvironmentSettings.in_streaming_mode())
     table = t_env.from_elements([(1, "Hi", "Hello"), (1, "Hi", "Hello")],
                                 ["a", "b", "c"])
     new_table = table.group_by("c").select("a.sum, c as b")
     ds = t_env.to_retract_stream(table=new_table,
                                  type_info=Types.ROW(
                                      [Types.LONG(),
                                       Types.STRING()]))
     test_sink = DataStreamTestSinkFunction()
     ds.map(lambda x: x).add_sink(test_sink)
     self.env.execute("test_to_retract_stream")
     result = test_sink.get_results(True)
     expected = [
         "(True, Row(f0=1, f1='Hello'))", "(False, Row(f0=1, f1='Hello'))",
         "(True, Row(f0=2, f1='Hello'))"
     ]
     self.assertEqual(result, expected)
Example #10
def load(token):

    # fetch the trading calendar dimension data
    pro = ts.pro_api(token)
    df = pro.trade_cal(start_date='20210310', end_date='20210410')

    # create the entry point of the Flink program
    env_settings = EnvironmentSettings.new_instance().in_batch_mode(
    ).use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # convert the pandas DataFrame into a Table and register it as a temporary view under an alias
    table = table_env.from_pandas(df)
    table_env.create_temporary_view("tai_date", table)

    # declare the output sink
    sink_ddl = """
    -- register the MySQL table 'dim_date' in Flink SQL
    create table Results(
            exchange STRING,
            cal_date STRING,
            is_open BIGINT
    ) with (
       'connector' = 'jdbc',
       'url' = 'jdbc:mysql://localhost:3306/shares',
       'table-name' = 'dim_date',
       'username' = 'root',
       'password' = '123456'
    )
    """
    table_env.execute_sql(sink_ddl)

    # the JDBC connector needs extra Java JARs on the classpath
    table_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///home/wy/shares/mysql-connector-java-5.1.49.jar;file:///home/wy/shares/flink-connector-jdbc_2.12-1.12.2.jar"
    )

    # when running on a mini cluster, call wait() to block until the job finishes
    table_env.execute_sql("insert into Results select * from tai_date").wait()
def log_processing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string("pipeline.jars",
                                                      "file://" + FAT_JAR_PATH)

    source_ddl = """
            CREATE TABLE source_table(
                a VARCHAR,
                b INT
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'source_topic',
              'properties.bootstrap.servers' = 'localhost:9092',
              'properties.group.id' = 'test_group',
              'scan.startup.mode' = 'earliest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                a VARCHAR
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'sink_topic',
              'properties.bootstrap.servers' = 'localhost:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    t_env.from_path("source_table").select("a").execute_insert(
        "sink_table").wait()
Example #12
def log_processing():
    env_settings = EnvironmentSettings.new_instance().use_blink_planner(
    ).in_streaming_mode().build()
    t_env = TableEnvironment.create(env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///my/jar/path/connector.jar;file:///my/jar/path/json.jar")

    source_ddl = """
            CREATE TABLE source_table(
                a VARCHAR,
                b INT,
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'topic',
              'properties.bootstrap.servers' = 'localhost:9092',
              'properties.group.id' = 'testGroup',
              'scan.startup.mode' = 'latest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                a VARCHAR
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'topic2',
              'properties.bootstrap.servers' = 'kafka:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    t_env.sql_query("SELECT a FROM source_table") \
        .execute_insert("sink_table").wait()
Example #13
def process_json_data_with_udf():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(elements=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                                schema=['id', 'data'])

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print').schema(
            Schema.new_builder().column('id', DataTypes.BIGINT()).column(
                'data', DataTypes.STRING()).build()).build())

    # update json columns
    @udf(result_type=DataTypes.STRING())
    def update_tel(data):
        json_data = json.loads(data)
        json_data['tel'] += 1
        return json.dumps(json_data)

    table = table.select(table.id, update_tel(table.data))

    # execute
    table.execute_insert('sink') \
         .wait()
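A minimal sketch of the imports Example #13 relies on (names taken from the PyFlink Table API; the snippet itself is not part of the original):

# Imports assumed by process_json_data_with_udf() (a sketch)
import json

from pyflink.table import (DataTypes, EnvironmentSettings, Schema,
                           TableDescriptor, TableEnvironment)
from pyflink.table.udf import udf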
Example #14
    def test_explain_with_multi_sinks_with_blink_planner(self):
        t_env = BatchTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance(
            ).in_batch_mode().use_blink_planner().build())
        source = t_env.from_elements([(1, "Hi", "Hello"),
                                      (2, "Hello", "Hello")], ["a", "b", "c"])
        field_names = ["a", "b", "c"]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.STRING()
        ]
        t_env.register_table_sink(
            "sink1", CsvTableSink(field_names, field_types, "path1"))
        t_env.register_table_sink(
            "sink2", CsvTableSink(field_names, field_types, "path2"))

        t_env.sql_update("insert into sink1 select * from %s where a > 100" %
                         source)
        t_env.sql_update("insert into sink2 select * from %s where a < 100" %
                         source)

        actual = t_env.explain(extended=True)
        self.assertIsInstance(actual, str)
Example #15
def setup_pyflink_env(gateway: JavaGateway, j_benv: JavaObject, j_senv: JavaObject) \
        -> Tuple[ExecutionEnvironment, BatchTableEnvironment, StreamExecutionEnvironment, StreamTableEnvironment]:
    """
    Set up Py4J for PyFlink and create the Python environment instances.
    """
    from pyflink.dataset import ExecutionEnvironment
    from pyflink.datastream import StreamExecutionEnvironment
    from pyflink.table import BatchTableEnvironment, StreamTableEnvironment
    import pyflink

    # noinspection PyUnresolvedReferences
    pyflink.java_gateway._gateway = gateway
    # noinspection PyUnresolvedReferences
    pyflink.java_gateway.import_flink_view(gateway)

    benv = ExecutionEnvironment(j_benv)
    senv = StreamExecutionEnvironment(j_senv)
    # noinspection PyDeprecation
    btenv = BatchTableEnvironment.create(benv)

    # noinspection PyDeprecation
    stenv_settings = EnvironmentSettings.new_instance().use_old_planner().in_streaming_mode().build()
    stenv = StreamTableEnvironment.create(senv, environment_settings=stenv_settings)
    return benv, btenv, senv, stenv
Example #16
def basic_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(elements=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                                schema=['id', 'data'])

    right_table = t_env.from_elements(elements=[(1, 18), (2, 30), (3, 25),
                                                (4, 10)],
                                      schema=['id', 'age'])

    table = table.add_columns(
                    col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
                    col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
                    col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
                 .drop_columns(col('data'))
    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                            tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |
    # | +I |                    2 |                          hello |                            135 |                          China |
    # | +I |                    3 |                          world |                            124 |                            USA |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # limit the number of outputs
    table.limit(3).execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                            tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |
    # | +I |                    2 |                          hello |                            135 |                          China |
    # | +I |                    3 |                          world |                            124 |                            USA |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # filter
    table.filter(col('id') != 3).execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                            tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |
    # | +I |                    2 |                          hello |                            135 |                          China |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # aggregation
    table.group_by(col('country')) \
         .select(col('country'), col('id').count, col('tel').cast(DataTypes.BIGINT()).max) \
         .execute().print()
    # +----+--------------------------------+----------------------+----------------------+
    # | op |                        country |               EXPR$0 |               EXPR$1 |
    # +----+--------------------------------+----------------------+----------------------+
    # | +I |                        Germany |                    1 |                  123 |
    # | +I |                            USA |                    1 |                  124 |
    # | +I |                          China |                    1 |                  135 |
    # | -U |                          China |                    1 |                  135 |
    # | +U |                          China |                    2 |                  135 |
    # +----+--------------------------------+----------------------+----------------------+

    # distinct
    table.select(col('country')).distinct() \
         .execute().print()
    # +----+--------------------------------+
    # | op |                        country |
    # +----+--------------------------------+
    # | +I |                        Germany |
    # | +I |                          China |
    # | +I |                            USA |
    # +----+--------------------------------+

    # join
    # Note that it still doesn't support duplicate column names between the joined tables
    table.join(right_table.rename_columns(col('id').alias('r_id')), col('id') == col('r_id')) \
         .execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+
    # | op |                   id |                           name |                            tel |                        country |                 r_id |                  age |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+
    # | +I |                    4 |                        PyFlink |                             32 |                          China |                    4 |                   10 |
    # | +I |                    1 |                          Flink |                            123 |                        Germany |                    1 |                   18 |
    # | +I |                    2 |                          hello |                            135 |                          China |                    2 |                   30 |
    # | +I |                    3 |                          world |                            124 |                            USA |                    3 |                   25 |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+

    # join lateral
    @udtf(result_types=[DataTypes.STRING()])
    def split(r: Row):
        for s in r.name.split("i"):
            yield s

    table.join_lateral(split.alias('a')) \
         .execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                            tel |                        country |                              a |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |                             Fl |
    # | +I |                    1 |                          Flink |                            123 |                        Germany |                             nk |
    # | +I |                    2 |                          hello |                            135 |                          China |                          hello |
    # | +I |                    3 |                          world |                            124 |                            USA |                          world |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |                           PyFl |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |                             nk |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+

    # show schema
    table.print_schema()
    # (
    #   `id` BIGINT,
    #   `name` STRING,
    #   `tel` STRING,
    #   `country` STRING
    # )

    # show execute plan
    print(table.join_lateral(split.alias('a')).explain())
Example #17
def log_processing():
    env_settings = EnvironmentSettings.new_instance().in_streaming_mode(
    ).use_blink_planner().build()
    t_env = StreamTableEnvironment.create(environment_settings=env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///Users/liuhongwei/.m2/repository/org/apache/flink/flink-connector-kafka_2.11/1.12.0/flink-connector-kafka_2.11-1.12.0.jar;file:///Users/liuhongwei/.m2/repository/net/sf/json-lib/json-lib/2.3/json-lib-2.3-jdk15.jar;file:///Users/liuhongwei/.m2/repository/org/apache/kafka/kafka-clients/2.4.1/kafka-clients-2.4.1.jar"
    )

    source_ddl = """
            CREATE TABLE source_table(
                token VARCHAR,
                stime BIGINT,
                appKey VARCHAR,
                user_action_time AS PROCTIME()
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'markTopic',
              'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
              'properties.group.id' = 'test_3',
              'scan.startup.mode' = 'earliest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                token VARCHAR,
                appKey VARCHAR,
                stime TIMESTAMP(3) NOT NULL,
                nums BIGINT NOT NULL
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'markTopic1',
              'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)
    query_sql = """
        SELECT 
          token,
          appKey,
          TUMBLE_START(user_action_time, INTERVAL '5' MINUTE) as stime, 
          COUNT(token) as nums 
        FROM source_table 
        WHERE appKey = 'YSHAppAndroidIOSH5'
        GROUP BY 
          token,
          appKey,
          TUMBLE(user_action_time, INTERVAL '5' MINUTE)
    """
    # t_env.sql_query(query_sql) \
    #     .execute_insert("sink_table").wait()
    source_t = t_env.from_path("source_table")
    result = source_t.filter(source_t.appKey == "YSHAppAndroidIOSH5") \
      .window(Slide.over(lit(1).days) \
        .every(lit(1).minutes) \
        .on(source_t.user_action_time).alias("w")) \
        .group_by(source_t.token, source_t.appKey, col("w")) \
          .select(source_t.token, source_t.appKey, col("w").start.alias("stime"), source_t.token.count.alias("nums"))

    result.execute_insert("sink_table").wait()
Example #18
 def setUp(self):
     super(PyFlinkBlinkBatchTableTestCase, self).setUp()
     self.t_env = BatchTableEnvironment.create(
         environment_settings=EnvironmentSettings.new_instance(
         ).in_batch_mode().use_blink_planner().build())
     self.t_env._j_tenv.getPlanner().getExecEnv().setParallelism(2)
Example #19
def kafka_to_mysql():
    """
    Read JSON data such as {"msg": "welcome flink users..."} from the Kafka source and write it to MySQL.
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    #JARS_DIR=/Users/jincheng.sunjc/work/PlaygroundEnv/myJars/
    #wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-json/1.11.1/flink-json-1.11.1.jar; \
    #wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka-0.11_2.11/1.11.1/flink-sql-connector-kafka-0.11_2.11-1.11.1.jar; \
    #wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc_2.11/1.11.1/flink-connector-jdbc_2.11-1.11.1.jar; \
    #wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/mysql/mysql-connector-java/5.1.40/mysql-connector-java-5.1.40.jar; \
    #wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-table-common/1.11.1/flink-table-common-1.11.1.jar ; \

    # add the dependency JARs
    base_dir = "file:///Users/jincheng.sunjc/work/PlaygroundEnv/myJars/"

    kafka_jar = f"{base_dir}flink-sql-connector-kafka-0.11_2.11-1.11.1.jar"
    jdbc_jar = f"{base_dir}flink-connector-jdbc_2.11-1.11.1.jar"
    mysql_jar = f"{base_dir}mysql-connector-java-5.1.40.jar"
    json_format_jar = f"{base_dir}flink-json-1.11.1.jar"
    table_common_jar = f"{base_dir}flink-table-common-1.11.1.jar"

    jar_seq = [kafka_jar, jdbc_jar, mysql_jar, json_format_jar, table_common_jar]
    jars = ";".join(jar_seq)

    t_env.get_config().get_configuration().set_string("pipeline.jars",jars)

    source_ddl = """
                    CREATE TABLE kafka_source (
                        msg STRING
                    ) WITH (
                        'connector' = 'kafka-0.11',
                        'topic' = 'cdn-log',
                        'properties.bootstrap.servers' = 'localhost:9092',
                        'format' = 'json',
                        'scan.startup.mode' = 'latest-offset'
                    )
                    """

    sink_ddl = """
                  CREATE TABLE mysql_sink (
                    msg STRING 
                ) WITH (
                   'connector' = 'jdbc',
                   'url' = 'jdbc:mysql://localhost:3306/flinkdb?characterEncoding=utf-8&useSSL=false',
                   'table-name' = 'cdn_log',
                   'username' = 'root',
                   'password' = '123456',
                   'sink.buffer-flush.max-rows' = '1'
                )
        """

    # register the source and sink tables
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # extract the data
    tab = t_env.from_path("kafka_source")
    # for now we still use the API marked as deprecated here, because testing of the new async submission needs further work...
    tab.insert_into("mysql_sink")
    # run the job
    t_env.execute("kafka_to_mysql")
Example #20
def group_by_window_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/table_group_by_window_agg_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
             )
         ) \
        .with_schema(  # declare the schema of the table
             Schema()
             .field("rowtime", DataTypes.TIMESTAMP())
             .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
             .field("a", DataTypes.STRING())
             .field("b", DataTypes.STRING())
             .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.STRING()], result_file))

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("group by window agg streaming")
Example #21
def word_count():
    environment_settings = EnvironmentSettings.new_instance().in_batch_mode(
    ).use_blink_planner().build()
    t_env = BatchTableEnvironment.create(
        environment_settings=environment_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    # set the Python executable explicitly in case `python` does not point to Python 3
    t_env.get_config().set_python_executable("python3")

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)

    t_env.get_config().get_configuration().set_string("parallelism.default",
                                                      "1")

    data = [
        ("iPhone 11,30,5499,Beijing", ), ("iPhone 11 Pro,20,8699,Guangzhou", ),
        ("MacBook Pro,10,9999,Beijing", ), ("AirPods Pro,50,1999,Beijing", ),
        ("MacBook Pro,10,11499,Shanghai", ), ("iPhone 11,30,5999,Shanghai", ),
        ("iPhone 11 Pro,20,9999,Shenzhen", ),
        ("MacBook Pro,10,13899,Hangzhou", ), ("iPhone 11,10,6799,Beijing", ),
        ("MacBook Pro,10,18999,Beijing", ),
        ("iPhone 11 Pro,10,11799,Shenzhen", ),
        ("MacBook Pro,10,22199,Shanghai", ), ("AirPods Pro,40,1999,Shanghai", )
    ]
    t_env.from_elements(data, ["line"]) \
        .select("split(line) as str_array") \
        .select("get(str_array, 3) as city, "
                "get(str_array, 1).cast(LONG) as count, "
                "get(str_array, 2).cast(LONG) as unit_price") \
        .select("city, count, count * unit_price as total_price") \
        .group_by("city") \
        .select("city, "
                "sum(count) as sales_volume, "
                "sum(total_price) as sales") \
        .insert_into("Results")

    t_env.execute("word_count")
Example #22
def test():
    # 1. create a TableEnvironment
    #env_settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    #table_env = StreamTableEnvironment.create(environment_settings=env_settings)
    env_settings = EnvironmentSettings.new_instance().in_batch_mode(
    ).use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # 2. create source Table
    table_env.execute_sql("""
        CREATE TABLE source_table (
            Region VARCHAR,
            Country	VARCHAR,
            Item_Type VARCHAR,
            Sales_Channel VARCHAR,
            Order_Priority VARCHAR,
            Order_Date VARCHAR,
            Order_ID VARCHAR,
            Ship_Date VARCHAR,
            Units_Sold VARCHAR,
            Unit_Price VARCHAR,
            Unit_Cost VARCHAR,
            Total_Revenue VARCHAR,
            Total_Cost VARCHAR,
            Total_Profit VARCHAR
        ) WITH (
            'connector' = 'filesystem',
            'path' = '/tmp/data/5m_Sales_Records.csv',
            'format' = 'csv'
        )
    """)

    table_env.execute_sql("""
        CREATE TABLE sink_table (
            Region VARCHAR,
            Country	VARCHAR,
            Item_Type VARCHAR,
            Sales_Channel VARCHAR,
            Order_Priority VARCHAR,
            Order_Date VARCHAR,
            Order_ID VARCHAR,
            Ship_Date VARCHAR,
            Units_Sold VARCHAR,
            Unit_Price VARCHAR,
            Unit_Cost VARCHAR,
            Total_Revenue VARCHAR,
            Total_Cost VARCHAR,
            Total_Profit VARCHAR
        )
          WITH (
            'connector' = 'filesystem',
            'path' = '/tmp/data/xxx_Sales_Records.csv',
            'format' = 'csv'
        )
    """)

    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    table_env.register_function("split", split)
    table_env.register_function("get", get)

    table_env.sql_query("SELECT * FROM source_table order by Region") \
            .execute_insert("sink_table").wait()
Example #23
def pv_uv_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user_behavior")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    user_id: {"
                "      type: 'string'"
                "    },"
                "    item_id: {"
                "      type: 'string'"
                "    },"
                "    category_id: {"
                "      type: 'string'"
                "    },"
                "    behavior: {"
                "      type: 'string'"
                "    },"
                "    ts: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("user_id", DataTypes.STRING())
            .field("item_id", DataTypes.STRING())
            .field("category_id", DataTypes.STRING())
            .field("behavior", DataTypes.STRING())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("ts")
                .watermarks_periodic_bounded(60000))
         ) \
        .in_append_mode() \
        .register_table_source("source")

    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \
        .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \
        .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \
        .property("connector.table", "pv_uv_table") \
        .property("connector.write.flush.max-rows", "1")
    st_env.connect(custom_connector) \
        .with_schema(
        Schema()
            .field("startTime", DataTypes.TIMESTAMP())
            .field("endTime", DataTypes.TIMESTAMP())
            .field("pv", DataTypes.BIGINT())
            .field("uv", DataTypes.BIGINT())
    ).register_table_sink("sink")

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("w.start as startTime, w.end as endTime, COUNT(1) as pv, user_id.count.distinct as uv").insert_into("sink")

    st_env.execute("table pv uv")
Example #24
def log_processing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=env_settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
            CREATE TABLE payment_msg(
                createTime VARCHAR,
                rt as TO_TIMESTAMP(createTime),
                orderId BIGINT,
                payAmount DOUBLE,
                payPlatform INT,
                provinceId INT,
                WATERMARK FOR rt as rt - INTERVAL '2' SECOND
            ) WITH (
              'connector' = 'kafka-0.11',
              'topic' = 'payment_msg',
              'properties.bootstrap.servers' = 'kafka:9092',
              'scan.startup.mode' = 'latest-offset',
              'format' = 'json'
            )
            """

    es_sink_ddl = """
            CREATE TABLE es_sink (
            province VARCHAR,
            pay_amount DOUBLE,
            rowtime TIMESTAMP(3)
            ) with (
                'connector.type' = 'elasticsearch',
                'connector.version' = '7',
                'connector.hosts' = 'http://elasticsearch:9200',
                'connector.index' = 'platform_pay_amount_1',
                'connector.document-type' = 'payment',
                'update-mode' = 'append',
                'connector.flush-on-checkpoint' = 'true',
                'connector.key-delimiter' = '$',
                'connector.key-null-literal' = 'n/a',
                'connector.bulk-flush.max-size' = '42mb',
                'connector.bulk-flush.max-actions' = '32',
                'connector.bulk-flush.interval' = '1000',
                'connector.bulk-flush.backoff.delay' = '1000',
                'format.type' = 'json'
            )
    """

    t_env.sql_update(source_ddl)
    t_env.sql_update(es_sink_ddl)

    t_env.register_function('province_id_to_name', province_id_to_name)

    query = """
    select province_id_to_name(provinceId) as province, sum(payAmount) as pay_amount, tumble_start(rt, interval '5' seconds) as rowtime
    from payment_msg
    group by tumble(rt, interval '5' seconds), provinceId
    """

    t_env.sql_query(query).insert_into("es_sink")

    t_env.execute("payment_demo")
Example #25
 def setUp(self):
     super(StreamDependencyTests, self).setUp()
     self.st_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
     self.st_env._execution_mode = "loopback"
Example #26
def distinct_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
             )
         ) \
        .with_schema(  # declare the schema of the table
             Schema()
             .field("rowtime", DataTypes.TIMESTAMP())
             .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
             .field("a", DataTypes.STRING())
             .field("b", DataTypes.STRING())
             .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("Orders")
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("distinct_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
        ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
           Json()
           .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")
    orders = st_env.scan("Orders")
    result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \
        .group_by("a, w").select("a, b.max.distinct as d")
    result.insert_into("result")
    st_env.execute("distinct agg streaming")
from pyflink.table import EnvironmentSettings, StreamTableEnvironment

# 1. create a TableEnvironment
env_settings = EnvironmentSettings.new_instance().in_streaming_mode(
).use_blink_planner().build()
table_env = StreamTableEnvironment.create(environment_settings=env_settings)

# 2. create source Table
table_env.execute_sql("""
    CREATE TABLE tweets(
        id INT,
        text STRING,
        user_screen_name STRING,
        followers_count INT,
        retweet_count INT,
        favorite_count INT
    ) WITH (
        'connector' = 'kafka',
        'topic' = 'tweets_data',
        'properties.bootstrap.servers' = 'localhost:2181'
    )
""")

# 3. create sink Table
table_env.execute_sql("""
    CREATE TABLE print (
        id INT,
        text STRING,
        user_screen_name STRING,
        followers_count INT,
        retweet_count INT
Example #28
def column_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(elements=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                                schema=['id', 'data'])

    # add columns
    table = table.add_columns(
        col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
        col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
        col('data').json_value('$.addr.country',
                               DataTypes.STRING()).alias('country'))

    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           data |                           name |                            tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 | {"name": "Flink", "tel": 12... |                          Flink |                            123 |                        Germany |
    # | +I |                    2 | {"name": "hello", "tel": 13... |                          hello |                            135 |                          China |
    # | +I |                    3 | {"name": "world", "tel": 12... |                          world |                            124 |                            USA |
    # | +I |                    4 | {"name": "PyFlink", "tel": ... |                        PyFlink |                             32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+

    # drop columns
    table = table.drop_columns(col('data'))
    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                            tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |
    # | +I |                    2 |                          hello |                            135 |                          China |
    # | +I |                    3 |                          world |                            124 |                            USA |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # rename columns
    table = table.rename_columns(col('tel').alias('telephone'))
    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                      telephone |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |
    # | +I |                    2 |                          hello |                            135 |                          China |
    # | +I |                    3 |                          world |                            124 |                            USA |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
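    # rename_columns takes one alias expression per column and several renames
    # can be passed at once, e.g. (an illustrative variant):
    # table.rename_columns(col('id').alias('user_id'), col('name').alias('user_name'))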

    # replace columns
    table = table.add_or_replace_columns(
        concat(col('id').cast(DataTypes.STRING()), '_',
               col('name')).alias('id'))
    table.execute().print()
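    # a sketch of the expected result, derived from the rows above: the 'id'
    # column is replaced by STRING values such as '1_Flink' and '2_hello',
    # while name, telephone and country stay unchanged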
Example #29
0
content = (
    'line Licensed to the Apache Software Foundation ASF under one '
    'line or more contributor license agreements See the NOTICE file '
    'line distributed with this work for additional information '
    'line regarding copyright ownership The ASF licenses this file '
    'to you under the Apache License Version the '
    'License you may not use this file except in compliance '
    'with the License'
)

env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(
    env,
    environment_settings=EnvironmentSettings.new_instance()
        .use_blink_planner()
        .build(),
)

result_path = '/notebooks/output.csv'

print('Results directory:', result_path)

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv()
    .field_delimiter(',')
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).with_schema(
    Schema()
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).create_temporary_table('Results')
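
# A minimal sketch of how the rest of the word count could look, assuming the
# sink above is registered as 'Results'; the string-based expressions and the
# t_env.execute call follow the old (pre-1.13) Table API and are assumptions.
elements = [(word, 1) for word in content.split(' ')]
t_env.from_elements(elements, ['word', 'count']) \
    .group_by('word') \
    .select('word, count(1) as count') \
    .insert_into('Results')

t_env.execute('word_count')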
Example #30
0
def row_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "China", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # map operation
    @udf(result_type=DataTypes.ROW([
        DataTypes.FIELD("id", DataTypes.BIGINT()),
        DataTypes.FIELD("country", DataTypes.STRING())
    ]))
    def extract_country(input_row: Row):
        data = json.loads(input_row.data)
        return Row(input_row.id, data['addr']['country'])

    table.map(extract_country) \
         .execute().print()
    # +----+----------------------+--------------------------------+
    # | op |                   id |                        country |
    # +----+----------------------+--------------------------------+
    # | +I |                    1 |                        Germany |
    # | +I |                    2 |                          China |
    # | +I |                    3 |                          China |
    # | +I |                    4 |                          China |
    # +----+----------------------+--------------------------------+
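    # the output column names ('id', 'country') are taken from the FIELD names
    # of the ROW result type declared on extract_country above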

    # flat_map operation
    @udtf(result_types=[DataTypes.BIGINT(), DataTypes.STRING()])
    def extract_city(input_row: Row):
        data = json.loads(input_row.data)
        yield input_row.id, data['addr']['city']

    table.flat_map(extract_city) \
         .execute().print()

    # +----+----------------------+--------------------------------+
    # | op |                   f0 |                             f1 |
    # +----+----------------------+--------------------------------+
    # | +I |                    1 |                         Berlin |
    # | +I |                    2 |                       Shanghai |
    # | +I |                    3 |                        NewYork |
    # | +I |                    4 |                       Hangzhou |
    # +----+----------------------+--------------------------------+
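    # the UDTF result columns get the default names f0/f1; they can be renamed
    # on the resulting table, e.g. (an illustrative variant):
    # table.flat_map(extract_city).alias('id', 'city')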

    # aggregate operation
    class CountAndSumAggregateFunction(AggregateFunction):
        def get_value(self, accumulator):
            return Row(accumulator[0], accumulator[1])

        def create_accumulator(self):
            return Row(0, 0)

        def accumulate(self, accumulator, input_row):
            accumulator[0] += 1
            accumulator[1] += int(input_row.tel)

        def retract(self, accumulator, input_row):
            accumulator[0] -= 1
            accumulator[1] -= int(input_row.tel)

        def merge(self, accumulator, accumulators):
            for other_acc in accumulators:
                accumulator[0] += other_acc[0]
                accumulator[1] += other_acc[1]

        def get_accumulator_type(self):
            return DataTypes.ROW([
                DataTypes.FIELD("cnt", DataTypes.BIGINT()),
                DataTypes.FIELD("sum", DataTypes.BIGINT())
            ])

        def get_result_type(self):
            return DataTypes.ROW([
                DataTypes.FIELD("cnt", DataTypes.BIGINT()),
                DataTypes.FIELD("sum", DataTypes.BIGINT())
            ])

    count_sum = udaf(CountAndSumAggregateFunction())
    table.add_columns(
            col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
            col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
            col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
         .group_by(col('country')) \
         .aggregate(count_sum.alias("cnt", "sum")) \
         .select(col('country'), col('cnt'), col('sum')) \
         .execute().print()

    # +----+--------------------------------+----------------------+----------------------+
    # | op |                        country |                  cnt |                  sum |
    # +----+--------------------------------+----------------------+----------------------+
    # | +I |                          China |                    3 |                  291 |
    # | +I |                        Germany |                    1 |                  123 |
    # +----+--------------------------------+----------------------+----------------------+
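    # get_value turns the accumulator into the emitted Row, and the
    # .alias("cnt", "sum") on the udaf call names the result columns used in
    # the select above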

    # flat_aggregate operation
    class Top2(TableAggregateFunction):
        def emit_value(self, accumulator):
            for v in accumulator:
                if v:
                    yield Row(v)

        def create_accumulator(self):
            return [None, None]

        def accumulate(self, accumulator, input_row):
            tel = int(input_row.tel)
            if accumulator[0] is None or tel > accumulator[0]:
                accumulator[1] = accumulator[0]
                accumulator[0] = tel
            elif accumulator[1] is None or tel > accumulator[1]:
                accumulator[1] = tel

        def get_accumulator_type(self):
            return DataTypes.ARRAY(DataTypes.BIGINT())

        def get_result_type(self):
            return DataTypes.ROW([DataTypes.FIELD("tel", DataTypes.BIGINT())])

    top2 = udtaf(Top2())
    table.add_columns(
            col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
            col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
            col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
        .group_by(col('country')) \
        .flat_aggregate(top2) \
        .select(col('country'), col('tel')) \
        .execute().print()
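
    # the Top2 accumulator keeps the two largest tel values per country:
    # 135 and 124 for China, and only 123 for Germany (its second slot stays None)


# a minimal, assumed entry point for running the example above (not part of
# the original snippet)
if __name__ == '__main__':
    row_operations()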