def test_watermarks_periodic_bounded(self):
    rowtime = Rowtime().watermarks_periodic_bounded(1000)

    properties = rowtime.to_properties()
    expected = {'rowtime.watermarks.type': 'periodic-bounded',
                'rowtime.watermarks.delay': '1000'}
    self.assertEqual(expected, properties)

def test_timestamps_from_field(self):
    rowtime = Rowtime().timestamps_from_field("rtime")

    properties = rowtime.to_properties()
    expected = {
        'rowtime.timestamps.type': 'from-field',
        'rowtime.timestamps.from': 'rtime'
    }
    self.assertEqual(expected, properties)

def test_watermarks_from_strategy(self):
    rowtime = Rowtime().watermarks_from_strategy(
        "org.apache.flink.table.legacyutils.CustomAssigner")

    properties = rowtime.to_properties()
    expected = {
        'rowtime.watermarks.type': 'custom',
        'rowtime.watermarks.class': 'org.apache.flink.table.legacyutils.CustomAssigner',
        'rowtime.watermarks.serialized':
            'rO0ABXNyADFvcmcuYXBhY2hlLmZsaW5rLnRhYmxlLmxlZ2FjeXV0aWxzLkN1c3RvbUFzc2lnbmVyu_8'
            'TLNBQBsACAAB4cgBHb3JnLmFwYWNoZS5mbGluay50YWJsZS5zb3VyY2VzLndtc3RyYXRlZ2llcy5QdW'
            '5jdHVhdGVkV2F0ZXJtYXJrQXNzaWduZXKBUc57oaWu9AIAAHhyAD1vcmcuYXBhY2hlLmZsaW5rLnRhY'
            'mxlLnNvdXJjZXMud21zdHJhdGVnaWVzLldhdGVybWFya1N0cmF0ZWd53nt-g2OWaT4CAAB4cA'}
    self.assertEqual(expected, properties)

def test_timestamps_from_extractor(self):
    rowtime = Rowtime().timestamps_from_extractor(
        "org.apache.flink.table.descriptors.RowtimeTest$CustomExtractor")

    properties = rowtime.to_properties()
    expected = {
        'rowtime.timestamps.type': 'custom',
        'rowtime.timestamps.class': 'org.apache.flink.table.descriptors.RowtimeTest$CustomExtractor',
        'rowtime.timestamps.serialized':
            'rO0ABXNyAD5vcmcuYXBhY2hlLmZsaW5rLnRhYmxlLmRlc2NyaXB0b3JzLlJvd3RpbWVUZXN0JEN1c3R'
            'vbUV4dHJhY3RvcoaChjMg55xwAgABTAAFZmllbGR0ABJMamF2YS9sYW5nL1N0cmluZzt4cgA-b3JnLm'
            'FwYWNoZS5mbGluay50YWJsZS5zb3VyY2VzLnRzZXh0cmFjdG9ycy5UaW1lc3RhbXBFeHRyYWN0b3Jf1'
            'Y6piFNsGAIAAHhwdAACdHM'}
    self.assertEqual(expected, properties)

def test_timestamps_from_extractor(self):
    rowtime = Rowtime().timestamps_from_extractor(
        "org.apache.flink.table.legacyutils.CustomExtractor")

    properties = rowtime.to_properties()
    expected = {
        'rowtime.timestamps.type': 'custom',
        'rowtime.timestamps.class': 'org.apache.flink.table.legacyutils.CustomExtractor',
        'rowtime.timestamps.serialized':
            'rO0ABXNyADJvcmcuYXBhY2hlLmZsaW5rLnRhYmxlLmxlZ2FjeXV0aWxzLkN1c3RvbUV4dHJhY3Rvctj'
            'ZLTGK9XvxAgABTAAFZmllbGR0ABJMamF2YS9sYW5nL1N0cmluZzt4cgA-b3JnLmFwYWNoZS5mbGluay'
            '50YWJsZS5zb3VyY2VzLnRzZXh0cmFjdG9ycy5UaW1lc3RhbXBFeHRyYWN0b3Jf1Y6piFNsGAIAAHhwd'
            'AACdHM'}
    self.assertEqual(expected, properties)

def test_rowtime(self):
    schema = Schema() \
        .field("int_field", DataTypes.INT()) \
        .field("long_field", DataTypes.BIGINT()) \
        .field("rtime", DataTypes.BIGINT()) \
        .rowtime(
            Rowtime().timestamps_from_field("long_field").watermarks_periodic_bounded(5000)) \
        .field("string_field", DataTypes.STRING())

    properties = schema.to_properties()
    expected = {
        'schema.0.name': 'int_field',
        'schema.0.data-type': 'INT',
        'schema.1.name': 'long_field',
        'schema.1.data-type': 'BIGINT',
        'schema.2.name': 'rtime',
        'schema.2.data-type': 'BIGINT',
        'schema.2.rowtime.timestamps.type': 'from-field',
        'schema.2.rowtime.timestamps.from': 'long_field',
        'schema.2.rowtime.watermarks.type': 'periodic-bounded',
        'schema.2.rowtime.watermarks.delay': '5000',
        'schema.3.name': 'string_field',
        'schema.3.data-type': 'VARCHAR(2147483647)'
    }
    self.assertEqual(expected, properties)

def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("eventTime")
                .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")

def test_watermarks_from_strategy(self):
    rowtime = Rowtime().watermarks_from_strategy(
        "org.apache.flink.table.descriptors.RowtimeTest$CustomAssigner")

    properties = rowtime.to_properties()
    expected = {
        'rowtime.watermarks.type': 'custom',
        'rowtime.watermarks.class': 'org.apache.flink.table.descriptors.RowtimeTest$CustomAssigner',
        'rowtime.watermarks.serialized':
            'rO0ABXNyAD1vcmcuYXBhY2hlLmZsaW5rLnRhYmxlLmRlc2NyaXB0b3JzLlJvd3RpbWVUZXN0JEN1c3R'
            'vbUFzc2lnbmVyeDcuDvfbu0kCAAB4cgBHb3JnLmFwYWNoZS5mbGluay50YWJsZS5zb3VyY2VzLndtc3'
            'RyYXRlZ2llcy5QdW5jdHVhdGVkV2F0ZXJtYXJrQXNzaWduZXKBUc57oaWu9AIAAHhyAD1vcmcuYXBhY'
            '2hlLmZsaW5rLnRhYmxlLnNvdXJjZXMud21zdHJhdGVnaWVzLldhdGVybWFya1N0cmF0ZWd53nt-g2OW'
            'aT4CAAB4cA'}
    self.assertEqual(expected, properties)

def register_transactions_source(st_env):
    st_env.connect(Kafka()
                   .version("universal")
                   .topic("transactions-data")
                   .start_from_latest()
                   .property("zookeeper.connect", "host.docker.internal:2181")
                   .property("bootstrap.servers", "host.docker.internal:19091")) \
        .with_format(Json()
                     .fail_on_missing_field(True)
                     .schema(DataTypes.ROW([
                         DataTypes.FIELD("customer", DataTypes.STRING()),
                         DataTypes.FIELD("transaction_type", DataTypes.STRING()),
                         DataTypes.FIELD("online_payment_amount", DataTypes.DOUBLE()),
                         DataTypes.FIELD("in_store_payment_amount", DataTypes.DOUBLE()),
                         DataTypes.FIELD("lat", DataTypes.DOUBLE()),
                         DataTypes.FIELD("lon", DataTypes.DOUBLE()),
                         DataTypes.FIELD("transaction_datetime", DataTypes.TIMESTAMP())]))) \
        .with_schema(Schema()
                     .field("customer", DataTypes.STRING())
                     .field("transaction_type", DataTypes.STRING())
                     .field("online_payment_amount", DataTypes.DOUBLE())
                     .field("in_store_payment_amount", DataTypes.DOUBLE())
                     .field("lat", DataTypes.DOUBLE())
                     .field("lon", DataTypes.DOUBLE())
                     .field("rowtime", DataTypes.TIMESTAMP())
                     .rowtime(
                         Rowtime()
                         .timestamps_from_field("transaction_datetime")
                         .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")

def test_watermarks_from_strategy(self):
    rowtime = Rowtime().watermarks_from_strategy(
        "org.apache.flink.table.utils.TestingDescriptors$CustomAssigner")

    properties = rowtime.to_properties()
    expected = {
        'rowtime.watermarks.type': 'custom',
        'rowtime.watermarks.class': 'org.apache.flink.table.utils.TestingDescriptors$CustomAssigner',
        'rowtime.watermarks.serialized':
            'rO0ABXNyAD5vcmcuYXBhY2hlLmZsaW5rLnRhYmxlLnV0aWxzLlRlc3RpbmdEZXNjcmlwdG9ycyRDdXN0b2'
            '1Bc3NpZ25lcsY_Xt96bBjDAgAAeHIAR29yZy5hcGFjaGUuZmxpbmsudGFibGUuc291cmNlcy53bXN0cmF0'
            'ZWdpZXMuUHVuY3R1YXRlZFdhdGVybWFya0Fzc2lnbmVygVHOe6GlrvQCAAB4cgA9b3JnLmFwYWNoZS5mbG'
            'luay50YWJsZS5zb3VyY2VzLndtc3RyYXRlZ2llcy5XYXRlcm1hcmtTdHJhdGVned57foNjlmk-AgAAeHA'
    }
    self.assertEqual(expected, properties)

def test_timestamps_from_extractor(self):
    rowtime = Rowtime().timestamps_from_extractor(
        "org.apache.flink.table.utils.TestingDescriptors$CustomExtractor")

    properties = rowtime.to_properties()
    expected = {
        'rowtime.timestamps.type': 'custom',
        'rowtime.timestamps.class': 'org.apache.flink.table.utils.TestingDescriptors$CustomExtractor',
        'rowtime.timestamps.serialized':
            'rO0ABXNyAD9vcmcuYXBhY2hlLmZsaW5rLnRhYmxlLnV0aWxzLlRlc3RpbmdEZXNjcmlwdG9ycyRDdXN0b2'
            '1FeHRyYWN0b3K-MntVKO8Z7QIAAUwABWZpZWxkdAASTGphdmEvbGFuZy9TdHJpbmc7eHIAPm9yZy5hcGFj'
            'aGUuZmxpbmsudGFibGUuc291cmNlcy50c2V4dHJhY3RvcnMuVGltZXN0YW1wRXh0cmFjdG9yX9WOqYhTbB'
            'gCAAB4cHQAAnRz'
    }
    self.assertEqual(expected, properties)

def distinct_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
            .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("Orders")
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("distinct_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
    ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
            Json()
            .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")
    orders = st_env.scan("Orders")
    result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \
        .group_by("a, w") \
        .select("a, b.max.distinct as d")
    result.insert_into("result")
    st_env.execute("distinct agg streaming")

def tumble_time_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/tumble_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("source")
    st_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file))
    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")
    st_env.execute("tumble time window streaming")

def test_timestamps_from_source(self):
    rowtime = Rowtime().timestamps_from_source()

    properties = rowtime.to_properties()
    expected = {'rowtime.timestamps.type': 'from-source'}
    self.assertEqual(expected, properties)

def pv_uv_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
            .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user_behavior")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    user_id: {"
                "      type: 'string'"
                "    },"
                "    item_id: {"
                "      type: 'string'"
                "    },"
                "    category_id: {"
                "      type: 'string'"
                "    },"
                "    behavior: {"
                "      type: 'string'"
                "    },"
                "    ts: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("user_id", DataTypes.STRING())
            .field("item_id", DataTypes.STRING())
            .field("category_id", DataTypes.STRING())
            .field("behavior", DataTypes.STRING())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("ts")
                .watermarks_periodic_bounded(60000))
        ) \
        .in_append_mode() \
        .register_table_source("source")
    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \
        .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \
        .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \
        .property("connector.table", "pv_uv_table") \
        .property("connector.write.flush.max-rows", "1")
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("startTime", DataTypes.TIMESTAMP())
            .field("endTime", DataTypes.TIMESTAMP())
            .field("pv", DataTypes.BIGINT())
            .field("uv", DataTypes.BIGINT())
        ).register_table_sink("sink")
    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("w.start as startTime, w.end as endTime, "
                "COUNT(1) as pv, user_id.count.distinct as uv") \
        .insert_into("sink")
    st_env.execute("table pv uv")

def test_watermarks_periodic_ascending(self):
    rowtime = Rowtime().watermarks_periodic_ascending()

    properties = rowtime.to_properties()
    expected = {'rowtime.watermarks.type': 'periodic-ascending'}
    self.assertEqual(expected, properties)

" type: 'string'" " }," " time: {" " type: 'string'," " format: 'date-time'" " }" " }" "}" ) ) \ .with_schema( # declare the schema of the table Schema() .field("rowtime", DataTypes.TIMESTAMP()) .rowtime( Rowtime() .timestamps_from_field("time") .watermarks_periodic_bounded(60000)) .field("a", DataTypes.STRING()) .field("b", DataTypes.DECIMAL(38,12,nullable=True)) .field("c", DataTypes.STRING()) ) \ .in_append_mode() \ .register_table_source("source") # way sink csv # st_env.register_table_sink("result_tab", # CsvTableSink(["a", "count"], # [DataTypes.STRING(), # DataTypes.DECIMAL(38, 12, nullable=True)], # result_file))