Example No. 1
def register_sink(st_env, index_name):
    st_env \
        .connect(
            Elasticsearch()
            .version("7")
            .host("localhost", 9200, "http")
            .index(index_name)
            .document_type('pyflink')
            .key_delimiter("_")
            .key_null_literal("null")
            .failure_handler_ignore()
            .disable_flush_on_checkpoint()
            .bulk_flush_max_actions(42)
            .bulk_flush_max_size("42 mb")
            .bulk_flush_interval(3000)
            .bulk_flush_backoff_constant()
            .bulk_flush_backoff_max_retries(3)
            .bulk_flush_backoff_delay(3000)
            .connection_max_retry_timeout(3)) \
        .with_schema(
            Schema()
            .field("a", DataTypes.INT())) \
        .with_format(
            Json()
            .schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT())]))) \
        .in_upsert_mode() \
        .create_temporary_table("sink")
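A minimal usage sketch for the sink registered above, assuming a previously registered single-column INT table named "source"; the environment setup, table name and index name are illustrative assumptions, not part of the original example:

from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment


def write_to_elasticsearch():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    st_env = StreamTableEnvironment.create(s_env)
    register_sink(st_env, "my-index")  # the helper from Example No. 1
    # push every row of the assumed "source" table into the Elasticsearch sink
    st_env.from_path("source").select("a").insert_into("sink")
    st_env.execute("write to elasticsearch")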
Example No. 2
def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())
            .rowtime(
            Rowtime()
                .timestamps_from_field("eventTime")
                .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
Example No. 3
def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("universal")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("eventTime", DataTypes.STRING()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("eventTime", DataTypes.STRING())) \
        .in_append_mode() \
        .create_temporary_table("source")
Example No. 4
def register_rides_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("rideTime", DataTypes.TIMESTAMP())
        ]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())) \
        .in_append_mode() \
        .register_table_sink("sink")
Example No. 5
def register_transactions_source(st_env):
    st_env.connect(Kafka()
                   .version("universal")
                   .topic("transactions-data")
                   .start_from_latest()
                   .property("zookeeper.connect", "host.docker.internal:2181")
                   .property("bootstrap.servers", "host.docker.internal:19091")) \
        .with_format(Json()
        .fail_on_missing_field(True)
        .schema(DataTypes.ROW([
        DataTypes.FIELD("customer", DataTypes.STRING()),
        DataTypes.FIELD("transaction_type", DataTypes.STRING()),
        DataTypes.FIELD("online_payment_amount", DataTypes.DOUBLE()),
        DataTypes.FIELD("in_store_payment_amount", DataTypes.DOUBLE()),
        DataTypes.FIELD("lat", DataTypes.DOUBLE()),
        DataTypes.FIELD("lon", DataTypes.DOUBLE()),
        DataTypes.FIELD("transaction_datetime", DataTypes.TIMESTAMP())]))) \
        .with_schema(Schema()
        .field("customer", DataTypes.STRING())
        .field("transaction_type", DataTypes.STRING())
        .field("online_payment_amount", DataTypes.DOUBLE())
        .field("in_store_payment_amount", DataTypes.DOUBLE())
        .field("lat", DataTypes.DOUBLE())
        .field("lon", DataTypes.DOUBLE())
        .field("rowtime", DataTypes.TIMESTAMP())
        .rowtime(
        Rowtime()
            .timestamps_from_field("transaction_datetime")
            .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
Example No. 6
    def test_fail_on_missing_field_true(self):
        json = Json().fail_on_missing_field(True)

        expected = {'format.fail-on-missing-field': 'true',
                    'format.property-version': '1',
                    'format.type': 'json'}

        properties = json.to_properties()
        self.assertEqual(expected, properties)
Example No. 7
    def test_ignore_parse_errors(self):
        json = Json().ignore_parse_errors(True)

        expected = {'format.ignore-parse-errors': 'true',
                    'format.property-version': '1',
                    'format.type': 'json'}

        properties = json.to_properties()
        self.assertEqual(expected, properties)
Example No. 8
    def test_derive_schema(self):
        json = Json().derive_schema()

        expected = {'format.derive-schema': 'true',
                    'format.property-version': '1',
                    'format.type': 'json'}

        properties = json.to_properties()
        self.assertEqual(expected, properties)
Example No. 9
    def test_schema(self):
        json = Json().schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT()),
                                            DataTypes.FIELD("b", DataTypes.STRING())]))

        expected = {'format.schema': 'ROW<a INT, b VARCHAR>',
                    'format.property-version': '1',
                    'format.type': 'json'}

        properties = json.to_properties()
        self.assertEqual(expected, properties)
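The four tests above each check the flat property map emitted by the Json format descriptor; a quick stand-alone sketch of the same call outside the test harness (assuming only that pyflink is installed) would be:

from pyflink.table import DataTypes
from pyflink.table.descriptors import Json

json_format = Json() \
    .fail_on_missing_field(True) \
    .schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT())]))
# prints the key/value pairs the descriptor contributes to a connector definition,
# e.g. format.type, format.fail-on-missing-field and format.schema
print(json_format.to_properties())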
Example No. 10
def group_by_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("group_by_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
        ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
           Json()
           .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")

    orders = st_env.scan("Orders")
    group_by_table = orders.group_by("a").select("a, b.sum as d")
    # The Elasticsearch index maps both fields as text:
    # {"a":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
    #  "b":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}
    # so the aggregated value is cast to VARCHAR in this demo.
    st_env.register_table("group_table", group_by_table)
    result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table")
    result.insert_into("result")
    st_env.execute("group by agg streaming")
Example No. 11
def register_transactions_es_sink(st_env):
    st_env.connect(Elasticsearch()
                   .version("7")
                   .host("localhost", 9200, "http")
                   .index("account-activity")
                   ) \
        .with_schema(Schema()
                     .field("event_id", DataTypes.STRING())
                     .field("account_id", DataTypes.DOUBLE())
                     .field("event_type", DataTypes.STRING())
                     .field("location_country", DataTypes.STRING())
                     .field("event_timestamp", DataTypes.TIMESTAMP(precision=3))) \
        .with_format(Json().derive_schema()).in_upsert_mode().create_temporary_table("sink_elasticsearch")
Example No. 12
def register_transactions_es_sink(st_env):
    st_env.connect(Elasticsearch()
                   .version("6")
                   .host("0.0.0.0", 9200, "http")
                   .index("transactions-supermarket-case")
                   .document_type("usage")) \
        .with_schema(Schema()
                     .field("customer", DataTypes.STRING())
                     .field("count_transactions", DataTypes.STRING())
                     .field("total_online_payment_amount", DataTypes.DOUBLE())
                     .field('total_in_store_payment_amount', DataTypes.DOUBLE())
                     .field("lon", DataTypes.FLOAT())
                     .field("lat", DataTypes.FLOAT())
                     .field('last_transaction_time', DataTypes.STRING())
                     ) \
        .with_format(Json().derive_schema()).in_upsert_mode().register_table_sink("sink_elasticsearch")
Example No. 13
def register_cnt_sink(st_env):
    st_env.connect(
        Elasticsearch()
            .version("6")
            .host("elasticsearch", 9200, "http")
            .index("area-cnts")
            .document_type('areacnt')
            .key_delimiter("$")) \
        .with_schema(
            Schema()
                .field("???", ???)
                .field("???", DataTypes.BIGINT())) \
        .with_format(
           Json()
               .derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("sink")
Example No. 14
def register_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("performance_source")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT())]))
            .fail_on_missing_field(True)) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("a", DataTypes.INT())) \
        .in_append_mode() \
        .create_temporary_table("source")
Example No. 15
def register_ride_duration_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("durationMin", DataTypes.BIGINT())
        ]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("durationMin", DataTypes.BIGINT())) \
        .in_append_mode() \
        .register_table_sink("TempResults")
Example No. 16
    def test_json_schema(self):
        json = Json().json_schema("{"
                                  "'title': 'Fruit',"
                                  "'type': 'object',"
                                  "'properties': "
                                  "{"
                                  "'name': {'type': 'string'},"
                                  "'count': {'type': 'integer'},"
                                  "'time': "
                                  "{"
                                  "'description': 'row time',"
                                  "'type': 'string',"
                                  "'format': 'date-time'"
                                  "}"
                                  "},"
                                  "'required': ['name', 'count', 'time']"
                                  "}")

        expected = {
            'format.json-schema':
            "{"
            "'title': 'Fruit',"
            "'type': 'object',"
            "'properties': {"
            "'name': {'type': 'string'},"
            "'count': {'type': 'integer'},"
            "'time': {"
            "'description': 'row time',"
            "'type': 'string',"
            "'format': 'date-time'}"
            "},"
            "'required': ['name', 'count', 'time']}",
            'format.property-version':
            '1',
            'format.type':
            'json'
        }

        properties = json.to_properties()
        self.assertEqual(expected, properties)
Example No. 17
def register_transactions_source(st_env):
    st_env.connect(Kafka()
                   .version("universal")
                   .topic("server-logs")
                   .start_from_earliest()
                   .property("zookeeper.connect", "localhost:2181")
                   .property("bootstrap.servers", "localhost:9092")) \
        .with_format(Json()
        .fail_on_missing_field(True)
        .schema(DataTypes.ROW([
        DataTypes.FIELD("event_id", DataTypes.STRING()),
        DataTypes.FIELD("account_id", DataTypes.DOUBLE()),
        DataTypes.FIELD("event_type", DataTypes.DOUBLE()),
        DataTypes.FIELD("location_country", DataTypes.DOUBLE()),
        DataTypes.FIELD("event_timestamp", DataTypes.TIMESTAMP(precision=3))]))) \
        .with_schema(Schema()
        .field("event_id", DataTypes.STRING())
        .field("account_id", DataTypes.DOUBLE())
        .field("event_type", DataTypes.STRING())
        .field("location_country", DataTypes.STRING())
        .field("event_timestamp", DataTypes.TIMESTAMP(precision=3))) \
        .in_append_mode() \
        .create_temporary_table("source")
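This source shares its field names and types with the Elasticsearch sink from Example No. 11, so the two helpers most likely come from the same job; a minimal driver wiring them together might look like the following (the environment setup and the pass-through query are assumptions, not part of the original snippets):

from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment


def run_pipeline():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    st_env = StreamTableEnvironment.create(s_env)
    register_transactions_source(st_env)   # Kafka source, Example No. 17
    register_transactions_es_sink(st_env)  # Elasticsearch sink, Example No. 11
    # forward the events unchanged from Kafka to Elasticsearch
    st_env.from_path("source") \
        .select("event_id, account_id, event_type, location_country, event_timestamp") \
        .insert_into("sink_elasticsearch")
    st_env.execute("server logs to elasticsearch")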
Example No. 18
from pyflink.table.descriptors import Kafka, Json, OldCsv, Schema, FileSystem

directories = ['/flink/lib']
for directory in directories:
    for jar in glob.glob(os.path.join(directory, '*.jar')):
        sys.path.append(jar)

# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer11
# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer09

OldCsv()
print("debug 010")

Kafka()
print("debug 020")
Json()
print("debug 030")


sourcetable = table_env \
    .connect(Kafka()
             .properties({'update-mode': 'append', 'connector.topic': 'machine.data',
                          'connector.properties.zookeeper.connect': 'localhost:2181',
                          'connector.properties.bootstrap.servers': 'localhost:9092'})) \
    .with_format(Json()
                 .json_schema(
                     "{type:'object',properties:{thing: {type: 'string'},quantity:{type:'string'},phenomenonTime:{type:'integer'},result:{type:'number'}}}")
                 .fail_on_missing_field(False)) \
    .with_schema(Schema()
                 .field("thing", DataTypes.STRING())
                 .field("quantity", DataTypes.STRING())
Example No. 19
            .use_blink_planner().build())

st_env \
    .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
    .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
    .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())
            .rowtime(
            Rowtime()
Example No. 20
         .start_from_specific_offset(0,496)
         .property("zookeeper.connect", "6.86.2.170:2181")
         .property("bootstrap.servers", "6.86.2.170:9092")
 ) \
     .with_format(  # declare a format for this system
     Json()
         .fail_on_missing_field(True)
         .json_schema(
         "{"
         "  type: 'object',"
         "  properties: {"
         "    a: {"
         "      type: 'string'"
         "    },"
         "    b: {"
         "      type: 'number'"
         "    },"
         "    c: {"
         "      type: 'string'"
         "    },"
         "    time: {"
         "      type: 'string',"
         "      format: 'date-time'"
         "    }"
         "  }"
         "}"
     )
 ) \
     .with_schema(  # declare the schema of the table
     Schema()
         .field("rowtime", DataTypes.TIMESTAMP())
         .rowtime(
Example No. 21
     Kafka()
         .version("0.11")
         .topic("input")
         .start_from_earliest()
         .property("zookeeper.connect", "zookeeper:2181")
         .property("bootstrap.servers", "kafka:9092")
 ) \
 .with_format(  # declare a format for this system
     Json()
         .fail_on_missing_field(True)
         .json_schema(
         "{"
         "  type: 'object',"
         "  properties: {"
         "    timestamp: {"
         "      type: 'string'"
         "    },"
         "    page: {"
         "      type: 'string'"
         "    }"
         "  }"
         "}"
     )
 ) \
 .with_schema(  # declare the schema of the table
     Schema()
         .field("timestamp", DataTypes.TIMESTAMP()).proctime()
         .field("page", DataTypes.STRING())
 ) \
 .in_append_mode() \
 .register_table_source("ClickEvent Source")
Example No. 22
st_env.register_function('predict', predict)


st_env.connect(
    Kafka()
    .version('universal')
    .topic('test')
    .start_from_earliest()
    .property('zookeeper.connect', 'zookeeper:2181')
    .property('bootstrap.servers', 'kafka:9092')
).with_format(
    Json()
    .fail_on_missing_field(True)
    .schema(
        DataTypes.ROW(
            [
                DataTypes.FIELD('datetime', DataTypes.STRING()),
                DataTypes.FIELD('text', DataTypes.STRING()),
            ]
        )
    )
).with_schema(
    Schema()
    .field('datetime', DataTypes.STRING())
    .field('text', DataTypes.STRING())
).in_append_mode().register_table_source(
    'source'
)


result_path = '/notebooks/output-tensorflow.csv'
Example No. 23
def distinct_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
             )
         ) \
        .with_schema(  # declare the schema of the table
             Schema()
             .field("rowtime", DataTypes.TIMESTAMP())
             .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
             .field("a", DataTypes.STRING())
             .field("b", DataTypes.STRING())
             .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("Orders")
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("distinct_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
        ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
           Json()
           .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")
    orders = st_env.scan("Orders")
    result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \
        .group_by("a, w").select("a, b.max.distinct as d")
    result.insert_into("result")
    st_env.execute("distinct agg streaming")
Example No. 24
def tumble_time_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/tumble_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
             )
         ) \
        .with_schema(  # declare the schema of the table
             Schema()
             .field("rowtime", DataTypes.TIMESTAMP())
             .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
             .field("a", DataTypes.STRING())
             .field("b", DataTypes.STRING())
             .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file))

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("tumble time window streaming")
Example No. 25
def pv_uv_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user_behavior")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    user_id: {"
                "      type: 'string'"
                "    },"
                "    item_id: {"
                "      type: 'string'"
                "    },"
                "    category_id: {"
                "      type: 'string'"
                "    },"
                "    behavior: {"
                "      type: 'string'"
                "    },"
                "    ts: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("user_id", DataTypes.STRING())
            .field("item_id", DataTypes.STRING())
            .field("category_id", DataTypes.STRING())
            .field("behavior", DataTypes.STRING())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("ts")
                .watermarks_periodic_bounded(60000))
         ) \
        .in_append_mode() \
        .register_table_source("source")

    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \
        .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \
        .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \
        .property("connector.table", "pv_uv_table") \
        .property("connector.write.flush.max-rows", "1")
    st_env.connect(custom_connector) \
        .with_schema(
        Schema()
            .field("startTime", DataTypes.TIMESTAMP())
            .field("endTime", DataTypes.TIMESTAMP())
            .field("pv", DataTypes.BIGINT())
            .field("uv", DataTypes.BIGINT())
    ).register_table_sink("sink")

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("w.start as startTime, w.end as endTime, COUNT(1) as pv, user_id.count.distinct as uv").insert_into("sink")

    st_env.execute("table pv uv")
Example No. 26
st_env = StreamTableEnvironment.create(s_env,
                                       environment_settings=env_settings)
st_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka-base_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-jdbc_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-sql-connector-kafka_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/kafka-clients-2.1.0.jar"
)

# read from Kafka
properties = {
    "zookeeper.connect": "nn1.hadoop:2181,nn2.hadoop:2181,s1.hadoop:2181",
    "bootstrap.servers": "nn1.hadoop:9092,nn2.hadoop:9092,s1.hadoop:9092",
    "group.id": "testGroup"
}
st_env.connect(Kafka().properties(properties).version("universal").topic("test").start_from_latest()) \
    .with_format(Json()).with_schema(Schema() \
        .field('throughputReqMax', DataTypes.BIGINT()) \
        .field('throughputReqTotal', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

# write to a CSV file
st_env.connect(FileSystem().path('/usr/local/flink/test/result3.txt')) \
    .with_format(OldCsv()
                .field('sub', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sub', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

# read from Kafka, compute throughputReqTotal - throughputReqMax, and insert the result into the sink
st_env.from_path('mySource')\
    .select("(throughputReqTotal-throughputReqMax)") \