Example #1
# imports assumed by this snippet (PyFlink 1.12-era APIs); register_kafka_source
# and register_transactions_sink_into_csv are user helpers defined elsewhere
from pyflink.datastream import (CheckpointingMode, ExternalizedCheckpointCleanup,
                                StreamExecutionEnvironment, TimeCharacteristic)
from pyflink.table import EnvironmentSettings, StreamTableEnvironment
from pyflink.table.expressions import col, lit
from pyflink.table.window import Slide

def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///app/src/kafka-clients-2.8.0.jar")
    env.add_jars("file:///app/src/flink-connector-kafka_2.12-1.12.3.jar")
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    config = env.get_checkpoint_config()
    config.enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION)

    st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())

    print("register kafka source")
    register_kafka_source(st_env)
    print("register transaction sinks")
    register_transactions_sink_into_csv(st_env)


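    # three sliding-window queries over the same source: 1-minute windows that
    # advance every 5 seconds on the event-time attribute "ts", feeding the
    # total, grep and top-k sinks respectively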
    st_env.from_path("source_tbl") \
       .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
       .group_by(col("w")) \
       .select("""count(message) as total,
                   w.end as end_time
                  """) \
       .insert_into("total_sink")

    st_env.from_path("source_tbl") \
       .where("message = 'dolorem'") \
       .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
       .group_by(col("w")) \
       .select("""
                   count(message) as total,
                   w.end as end_time
                  """) \
       .insert_into("grep_sink")

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w"), col("message")) \
        .select("""
                    count(message) as total,
                    message,
                    w.end as end_time
                   """) \
        .insert_into("topk_sink")

    st_env.execute("app")
Example #2
    def test_sliding_group_window_over_proctime(self):
        self.t_env.get_config().set("parallelism.default", "1")
        from pyflink.table.window import Slide
        self.t_env.register_function("mean_udaf", mean_udaf)

        source_table = """
            create table source_table(
                a INT,
                proctime as PROCTIME()
            ) with(
                'connector' = 'datagen',
                'rows-per-second' = '1',
                'fields.a.kind' = 'sequence',
                'fields.a.start' = '1',
                'fields.a.end' = '10'
            )
        """
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")
        iterator = t.select(t.a, t.proctime) \
            .window(Slide.over(lit(1).seconds)
                    .every(lit(1).seconds)
                    .on(t.proctime)
                    .alias("w")) \
            .group_by(t.a, col("w")) \
            .select(mean_udaf(t.a).alias("b"), col("w").start).execute().collect()
        result = [i for i in iterator]
        # if the WindowAssigner.isEventTime() does not return false,
        # the w.start would be 1970-01-01
        # TODO: After fixing the TimeZone problem of window with processing time (will be fixed in
        # FLIP-162), we should replace it with a more accurate assertion.
        self.assertTrue(result[0][1].year > 1970)
Example #3
def popular_taxi_vendor():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"))
    taxi_ride = t_env.from_path('TaxiRide')
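    # 15-minute windows sliding every 5 minutes on the event-time column
    # pickupTime, whose watermark lags the data by 30 seconds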
    popular_rides = taxi_ride.select(taxi_ride.vendorId, taxi_ride.pickupTime) \
        .window(Slide.over('15.minutes').every('5.minutes').on(taxi_ride.pickupTime).alias('w')) \
        .group_by(taxi_ride.vendorId, col('w')) \
        .select(taxi_ride.vendorId, \
                col('w').start.alias('start'), \
                col('w').end.alias('end'), \
                taxi_ride.vendorId.count.alias('cnt'))

    t_env.to_append_stream(
        popular_rides,
        Types.ROW_NAMED(['vendorId', 'start', 'end', 'cnt'], [
            Types.INT(),
            Types.SQL_TIMESTAMP(),
            Types.SQL_TIMESTAMP(),
            Types.LONG()
        ])).print()

    env.execute('Popular-Taxi-Vendor')
Example #4
    def test_sliding_group_window_over_count(self):
        self.t_env.get_config().set("parallelism.default", "1")
        # create source file path
        import tempfile
        import os
        tmp_dir = tempfile.gettempdir()
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00',
            '3,3,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_sliding_group_window_over_count.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        from pyflink.table.window import Slide
        self.t_env.get_config().set(
            "pipeline.time-characteristic", "ProcessingTime")
        self.t_env.register_function("mean_udaf", mean_udaf)

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                protime as PROCTIME()
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'd'],
            [
                DataTypes.TINYINT(),
                DataTypes.FLOAT()])
        self.t_env.register_table_sink("Results", table_sink)
        t.window(Slide.over(row_interval(2))
                 .every(row_interval(1))
                 .on(t.protime)
                 .alias("w")) \
            .group_by(t.a, t.b, col("w")) \
            .select(t.a, mean_udaf(t.c).alias("b")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[1, 2.5]", "+I[1, 5.5]", "+I[2, 2.0]", "+I[3, 2.5]"])
        os.remove(source_path)
Example #5
    def test_slide_window(self):
        t = self.t_env.from_elements([(1000, 1, "Hello")], ["a", "b", "c"])
        result = t.window(Slide.over(expr.lit(2).seconds).every(expr.lit(1).seconds).on("a")
                          .alias("w")).group_by(expr.col('w'), expr.col('c')).select(t.b.sum)

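        # inspect the underlying Java QueryOperation; slide and size are
        # reported in milliseconds (1000 ms slide, 2000 ms size)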
        query_operation = result._j_table.getQueryOperation().getChildren().get(0)
        self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
        self.assertEqual('SlideWindow(field: [a], slide: [1000], size: [2000])',
                         query_operation.getGroupWindow().asSummaryString())
Example #6
    def test_slide_group_window_aggregate_function(self):
        import datetime
        from pyflink.table.window import Slide
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd', 'e'],
            [
                DataTypes.TINYINT(),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT(),
                DataTypes.INT()
            ])
        self.t_env.register_table_sink("Results", table_sink)
        self.t_env.register_function("max_add", udaf(MaxAdd(),
                                                     result_type=DataTypes.INT(),
                                                     func_type="pandas"))
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
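        # 1-hour windows sliding every 30 minutes; the select mixes a pandas
        # UDAF (mean_udaf) with the registered "max_add" invoked via call()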
        slide_window = Slide.over(lit(1).hours) \
            .every(lit(30).minutes) \
            .on(col("rowtime")) \
            .alias("w")
        t.window(slide_window) \
            .group_by(t.a, col("w")) \
            .select(t.a,
                    col("w").start,
                    col("w").end,
                    mean_udaf(t.b),
                    call("max_add", t.b, t.c, 1)) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[1, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0, 6]",
                            "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.5, 7]",
                            "+I[1, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 5.5, 14]",
                            "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0, 14]",
                            "+I[2, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 1.0, 4]",
                            "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0, 10]",
                            "+I[2, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 3.0, 10]",
                            "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0, 7]",
                            "+I[3, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0, 7]"])
Example #7
def perform_sliding_window_aggregation(input_table_name):
    # use SQL Table in the Table API
    input_table = table_env.from_path(input_table_name)

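    # 10-second windows emitted every 5 seconds, keeping the minimum price per
    # ticker in each window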
    sliding_window_table = (input_table.window(
        Slide.over("10.seconds").every("5.seconds").on("event_time").alias(
            "ten_second_window")
    ).group_by("ticker, ten_second_window").select(
        "ticker, price.min as price, ten_second_window.end as event_time"))

    return sliding_window_table
Example #8
def perform_sliding_window_aggregation(input_table_name):
    # use SQL Table in the Table API
    input_table = table_env.from_path(input_table_name)

    sliding_window_table = (input_table.window(
        Slide.over("10.seconds").every("5.seconds").on("EVENT_TIME").alias(
            "ten_second_window")
    ).group_by("TICKER, ten_second_window").select(
        "TICKER, PRICE.min as PRICE, ten_second_window.end as EVENT_TIME"))

    return sliding_window_table
Example #9
    def test_slide_window(self):
        t = self.t_env.from_elements([(1000, 1, "Hello"), (2000, 2, "Hello"),
                                      (3000, 4, "Hello"), (4000, 8, "Hello")],
                                     ["a", "b", "c"])

        result = t.window(Slide.over("2.seconds").every("1.seconds").on("a").alias("w"))\
            .group_by("w, c").select("b.sum")
        actual = self.collect(result)

        expected = ['1', '3', '6', '12', '8']
        self.assert_equals(actual, expected)
Example #10
    def test_sliding_group_window_over_count(self):
        self.t_env.get_config().get_configuration().set_string("parallelism.default", "1")
        # create source file path
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00',
            '3,3,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_sliding_group_window_over_count.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.register_function("my_sum", SumAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                protime as PROCTIME()
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'd'],
            [
                DataTypes.TINYINT(),
                DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t.window(Slide.over("2.rows").every("1.rows").on("protime").alias("w")) \
            .group_by("a, w") \
            .select("a, my_sum(c) as b") \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[1, 5]", "+I[1, 11]", "+I[2, 4]", "+I[3, 5]"])
Example #11
# imports assumed by this snippet (PyFlink 1.14+ APIs)
from pyflink.common.time import Instant
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (DataTypes, Schema, StreamTableEnvironment,
                           TableDescriptor)
from pyflink.table.expressions import col, lit
from pyflink.table.window import Slide

def sliding_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    # define the sliding window operation
    table = table.window(Slide.over(lit(5).seconds).every(lit(2).seconds).on(col("ts")).alias("w"))\
                 .group_by(col('name'), col('w')) \
                 .select(col('name'), col('price').sum, col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
Example #12
    def test_slide_window(self):
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.INT(), DataTypes.STRING()]
        data = [(1000, 1, "Hello"), (2000, 2, "Hello"), (3000, 4, "Hello"), (4000, 8, "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
        t_env = self.t_env
        t_env.register_table_source("Source", csv_source)
        source = t_env.scan("Source")

        result = source.window(Slide.over("2.seconds").every("1.seconds").on("a").alias("w"))\
            .group_by("w, c").select("b.sum")
        actual = self.collect(result)

        expected = ['1', '3', '6', '12', '8']
        self.assert_equals(actual, expected)
Example #13
    def test_slide_window(self):
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.INT(), DataTypes.STRING()]
        data = [(1000, 1, "Hello"), (2000, 2, "Hello"), (3000, 4, "Hello"), (4000, 8, "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
        t_env = self.t_env
        t_env.register_table_source("Source", csv_source)
        source = t_env.scan("Source")

        result = source.window(Slide.over("2.seconds").every("1.seconds").on("a").alias("w"))\
            .group_by("w, c").select("b.sum")
        actual = self.collect(result)

        expected = ['1', '3', '6', '12', '8']
        self.assert_equals(actual, expected)
Example #14
    def test_sliding_group_window_over_count(self):
        self.t_env.get_config().set("parallelism.default", "1")
        # create source file path
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00', '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00', '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00', '2,2,3,2018-03-11 03:30:00',
            '3,3,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_sliding_group_window_over_count.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.register_function("my_sum", SumAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                protime as PROCTIME()
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, d BIGINT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
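        # the same 2-row/1-row count window as Example #10, but expressed with
        # row_interval() expressions and a DDL-defined test sink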
        t.window(Slide.over(row_interval(2)).every(row_interval(1)).on(t.protime).alias("w")) \
            .group_by(t.a, col("w")) \
            .select(t.a, call("my_sum", t.c).alias("b")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[1, 5]", "+I[1, 11]", "+I[2, 4]", "+I[3, 5]"])
Example #15
def slide_time_window_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_slide_time_window_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result", CsvTableSink(["a"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Slide.over("60.minutes").every("10.minutes").on("rowtime").alias("w")) \
        .group_by("w").select("b.sum")
    result.insert_into("result")
    bt_env.execute("slide time window batch")
Example #16
def slide_row_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/slide_row_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
             )
         ) \
        .with_schema(  # declare the schema of the table
             Schema()
             .field("proctime", DataTypes.TIMESTAMP())
             .proctime()
             .field("a", DataTypes.STRING())
             .field("b", DataTypes.STRING())
             .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.STRING()], result_file))

    st_env.scan("source").window(Slide.over("2.rows").every("1.rows").on("proctime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("slide row window streaming")
Example #17
def log_processing():
    env_settings = EnvironmentSettings.new_instance().in_streaming_mode(
    ).use_blink_planner().build()
    t_env = StreamTableEnvironment.create(environment_settings=env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///Users/liuhongwei/.m2/repository/org/apache/flink/flink-connector-kafka_2.11/1.12.0/flink-connector-kafka_2.11-1.12.0.jar;file:///Users/liuhongwei/.m2/repository/net/sf/json-lib/json-lib/2.3/json-lib-2.3-jdk15.jar;file:///Users/liuhongwei/.m2/repository/org/apache/kafka/kafka-clients/2.4.1/kafka-clients-2.4.1.jar"
    )

    source_ddl = """
            CREATE TABLE source_table(
                token VARCHAR,
                stime BIGINT,
                appKey VARCHAR,
                user_action_time AS PROCTIME()
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'markTopic',
              'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
              'properties.group.id' = 'test_3',
              'scan.startup.mode' = 'earliest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                token VARCHAR,
                appKey VARCHAR,
                stime TIMESTAMP(3) NOT NULL,
                nums BIGINT NOT NULL
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'markTopic1',
              'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)
    query_sql = """
        SELECT 
          token,
          appKey,
          TUMBLE_START(user_action_time, INTERVAL '5' MINUTE) as stime, 
          COUNT(token) as nums 
        FROM source_table 
        WHERE appKey = 'YSHAppAndroidIOSH5'
        GROUP BY 
          token,
          appKey,
          TUMBLE(user_action_time, INTERVAL '5' MINUTE)
    """
    # t_env.sql_query(query_sql) \
    #     .execute_insert("sink_table").wait()
    source_t = t_env.from_path("source_table")
    result = source_t.filter(source_t.appKey == "YSHAppAndroidIOSH5") \
        .window(Slide.over(lit(1).days)
                .every(lit(1).minutes)
                .on(source_t.user_action_time)
                .alias("w")) \
        .group_by(source_t.token, source_t.appKey, col("w")) \
        .select(source_t.token,
                source_t.appKey,
                col("w").start.alias("stime"),
                source_t.token.count.alias("nums"))

    result.execute_insert("sink_table").wait()
Example #18
    def test_sliding_group_window_over_time(self):
        # create source file path
        import tempfile
        import os
        tmp_dir = tempfile.gettempdir()
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_sliding_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        from pyflink.table.window import Slide
        self.t_env.get_config().set(
            "pipeline.time-characteristic", "EventTime")
        self.t_env.register_function("mean_udaf", mean_udaf)

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd'],
            [
                DataTypes.TINYINT(),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT()])
        self.t_env.register_table_sink("Results", table_sink)
        t.window(Slide.over(lit(1).hours)
                 .every(lit(30).minutes)
                 .on(col("rowtime"))
                 .alias("w")) \
            .group_by(t.a, t.b, col("w")) \
            .select(t.a, col("w").start, col("w").end, mean_udaf(t.c).alias("b")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[1, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0]",
                            "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.5]",
                            "+I[1, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 5.5]",
                            "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0]",
                            "+I[2, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 1.0]",
                            "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0]",
                            "+I[2, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 3.0]",
                            "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0]",
                            "+I[3, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0]"])
        os.remove(source_path)
Example #19
    def test_sliding_group_window_over_time(self):
        # create source file path
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:30:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
        ]
        source_path = tmp_dir + '/test_sliding_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.create_temporary_system_function("my_sum",
                                                    SumAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c INT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        table_sink = source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd'], [
            DataTypes.TINYINT(),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.BIGINT()
        ])
        self.t_env.register_table_sink("Results", table_sink)
        t.window(Slide.over(lit(1).hours)
                 .every(lit(30).minutes)
                 .on(t.rowtime)
                 .alias("w")) \
            .group_by(t.a, col("w")) \
            .select(t.a, col("w").start, col("w").end, call("my_sum", t.c).alias("c")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "+I[1, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2]",
            "+I[2, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 1]",
            "+I[3, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2]",
            "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 5]",
            "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2]",
            "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2]",
            "+I[2, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 1]",
            "+I[1, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 11]",
            "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8]"
        ])
Example #20
    :param name:
    :param sex:
    :param action:
    :param is_delete:
    :return:
    """
    names = name[sex == 'female']
    return names.value_counts().iloc[:10].to_json()


# register the UDAFs
t_env.create_temporary_function('male_click_top10', male_click_top10)
t_env.create_temporary_function('female_click_top10', female_click_top10)

# ########################### stream processing job ###########################

# sliding window: 60-second windows emitted every second
slide_window = Slide.over("60.seconds").every("1.seconds").on('ts').alias("w")

# based on the Table API
t_env.from_path('source') \
    .filter("action = 'click' and is_delete = 1 ") \
    .window(slide_window) \
    .group_by("w") \
    .select("male_click_top10(name, sex) AS male_top10, "
            "female_click_top10(name, sex) AS female_top10, "
            "w.start AS start_time, "
            "w.end AS end_time") \
    .execute_insert("sink")
t_env.execute(source_topic)