def register_sink(st_env, index_name):
    st_env \
        .connect(
            Elasticsearch()
            .version("7")
            .host("localhost", 9200, "http")
            .index(index_name)
            .document_type('pyflink')
            .key_delimiter("_")
            .key_null_literal("null")
            .failure_handler_ignore()
            .disable_flush_on_checkpoint()
            .bulk_flush_max_actions(42)
            .bulk_flush_max_size("42 mb")
            .bulk_flush_interval(3000)
            .bulk_flush_backoff_constant()
            .bulk_flush_backoff_max_retries(3)
            .bulk_flush_backoff_delay(3000)
            .connection_max_retry_timeout(3)) \
        .with_schema(
            Schema()
            .field("a", DataTypes.INT())) \
        .with_format(
            Json()
            .schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT())]))) \
        .in_upsert_mode() \
        .create_temporary_table("sink")
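# A minimal driver for register_sink above -- a sketch, assuming PyFlink 1.11
# and an Elasticsearch 7 connector jar on the classpath; the index name
# "demo_index" is illustrative, not from the original snippet.
def run_sink_demo():
    from pyflink.datastream import StreamExecutionEnvironment
    from pyflink.table import StreamTableEnvironment, EnvironmentSettings

    s_env = StreamExecutionEnvironment.get_execution_environment()
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    register_sink(st_env, "demo_index")
    # write a single row into the Elasticsearch-backed table
    st_env.from_elements([(1,)], ['a']).execute_insert("sink")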
def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("eventTime")
                .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("universal")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("eventTime", DataTypes.STRING()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("eventTime", DataTypes.STRING())) \
        .in_append_mode() \
        .create_temporary_table("source")
def register_rides_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("rideTime", DataTypes.TIMESTAMP())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())) \
        .in_append_mode() \
        .register_table_sink("sink")
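# End-to-end wiring sketch for the Kafka 0.11 rides helpers above -- hedged:
# assumes the "Rides" and "TempResults" topics exist and the Kafka 0.11
# connector is on the classpath; the job name is illustrative.
def run_rides_passthrough():
    from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
    from pyflink.table import StreamTableEnvironment

    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    register_rides_source(st_env)
    register_rides_sink(st_env)
    # forward every ride event unchanged from source to sink
    st_env.scan("source").insert_into("sink")
    st_env.execute("rides passthrough")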
def register_transactions_source(st_env):
    st_env.connect(
        Kafka()
        .version("universal")
        .topic("transactions-data")
        .start_from_latest()
        .property("zookeeper.connect", "host.docker.internal:2181")
        .property("bootstrap.servers", "host.docker.internal:19091")) \
        .with_format(
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("customer", DataTypes.STRING()),
                DataTypes.FIELD("transaction_type", DataTypes.STRING()),
                DataTypes.FIELD("online_payment_amount", DataTypes.DOUBLE()),
                DataTypes.FIELD("in_store_payment_amount", DataTypes.DOUBLE()),
                DataTypes.FIELD("lat", DataTypes.DOUBLE()),
                DataTypes.FIELD("lon", DataTypes.DOUBLE()),
                DataTypes.FIELD("transaction_datetime", DataTypes.TIMESTAMP())]))) \
        .with_schema(
            Schema()
            .field("customer", DataTypes.STRING())
            .field("transaction_type", DataTypes.STRING())
            .field("online_payment_amount", DataTypes.DOUBLE())
            .field("in_store_payment_amount", DataTypes.DOUBLE())
            .field("lat", DataTypes.DOUBLE())
            .field("lon", DataTypes.DOUBLE())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("transaction_datetime")
                .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
def test_fail_on_missing_field_true(self):
    json = Json().fail_on_missing_field(True)

    expected = {'format.fail-on-missing-field': 'true',
                'format.property-version': '1',
                'format.type': 'json'}

    properties = json.to_properties()
    self.assertEqual(expected, properties)
def test_ignore_parse_errors(self):
    json = Json().ignore_parse_errors(True)

    expected = {'format.ignore-parse-errors': 'true',
                'format.property-version': '1',
                'format.type': 'json'}

    properties = json.to_properties()
    self.assertEqual(expected, properties)
def test_derive_schema(self):
    json = Json().derive_schema()

    expected = {'format.derive-schema': 'true',
                'format.property-version': '1',
                'format.type': 'json'}

    properties = json.to_properties()
    self.assertEqual(expected, properties)
def test_schema(self):
    json = Json().schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT()),
                                        DataTypes.FIELD("b", DataTypes.STRING())]))

    expected = {'format.schema': 'ROW<a INT, b VARCHAR>',
                'format.property-version': '1',
                'format.type': 'json'}

    properties = json.to_properties()
    self.assertEqual(expected, properties)
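# The tests above exercise the Json descriptor's three alternative ways of
# obtaining a schema; only one should be set per format instance. A quick
# side-by-side (illustrative):
json_from_type = Json().schema(
    DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT())]))  # explicit type information
json_derived = Json().derive_schema()  # reuse the table schema declared via with_schema(...)
json_from_json_schema = Json().json_schema(
    "{type: 'object', properties: {a: {type: 'integer'}}}")  # JSON-schema string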
def group_by_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [DataTypes.STRING(),
                        DataTypes.INT(),
                        DataTypes.INT(),
                        DataTypes.TIMESTAMP()]))
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("group_by_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())) \
        .with_format(
            Json()
            .derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("result")

    orders = st_env.scan("Orders")
    group_by_table = orders.group_by("a").select("a, b.sum as d")
    # The Elasticsearch index mapping declares both fields as text, i.e.
    # {"a":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
    #  "b":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}},
    # so the aggregated value d must be cast to VARCHAR before writing.
    st_env.register_table("group_table", group_by_table)
    result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table")
    result.insert_into("result")
    st_env.execute("group by agg streaming")
def register_transactions_es_sink(st_env):
    st_env.connect(
        Elasticsearch()
        .version("7")
        .host("localhost", 9200, "http")
        .index("account-activity")) \
        .with_schema(
            Schema()
            .field("event_id", DataTypes.STRING())
            .field("account_id", DataTypes.DOUBLE())
            .field("event_type", DataTypes.STRING())
            .field("location_country", DataTypes.STRING())
            .field("event_timestamp", DataTypes.TIMESTAMP(precision=3))) \
        .with_format(Json().derive_schema()) \
        .in_upsert_mode() \
        .create_temporary_table("sink_elasticsearch")
def register_transactions_es_sink(st_env):
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("0.0.0.0", 9200, "http")
        .index("transactions-supermarket-case")
        .document_type("usage")) \
        .with_schema(
            Schema()
            .field("customer", DataTypes.STRING())
            .field("count_transactions", DataTypes.STRING())
            .field("total_online_payment_amount", DataTypes.DOUBLE())
            .field('total_in_store_payment_amount', DataTypes.DOUBLE())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field('last_transaction_time', DataTypes.STRING())) \
        .with_format(Json().derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("sink_elasticsearch")
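# Hedged sketch joining the supermarket pieces: the "transactions-data" source
# (register_transactions_source above) feeding this Elasticsearch 6 sink
# through a per-customer tumbling window. The one-hour window and the choice
# of aggregates are illustrative assumptions, not from the original snippets.
def run_supermarket_case(st_env):
    register_transactions_source(st_env)
    register_transactions_es_sink(st_env)
    result = st_env.sql_query(
        "SELECT customer, "
        " CAST(COUNT(*) AS VARCHAR) AS count_transactions, "
        " SUM(online_payment_amount) AS total_online_payment_amount, "
        " SUM(in_store_payment_amount) AS total_in_store_payment_amount, "
        " CAST(MAX(lon) AS FLOAT) AS lon, "
        " CAST(MAX(lat) AS FLOAT) AS lat, "
        " CAST(TUMBLE_END(rowtime, INTERVAL '1' HOUR) AS VARCHAR) AS last_transaction_time "
        "FROM source "
        "GROUP BY customer, TUMBLE(rowtime, INTERVAL '1' HOUR)")
    result.insert_into("sink_elasticsearch")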
def register_cnt_sink(st_env):
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("elasticsearch", 9200, "http")
        .index("area-cnts")
        .document_type('areacnt')
        .key_delimiter("$")) \
        .with_schema(
            Schema()
            .field("???", ???)
            .field("???", DataTypes.BIGINT())) \
        .with_format(
            Json()
            .derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("sink")
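# One hypothetical way to fill the "???" placeholders above -- hedged: the
# real field names and types depend on the query that writes to this sink;
# "area" and "cnt" are illustrative names only.
def register_cnt_sink_example(st_env):
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("elasticsearch", 9200, "http")
        .index("area-cnts")
        .document_type('areacnt')
        .key_delimiter("$")) \
        .with_schema(
            Schema()
            .field("area", DataTypes.STRING())   # hypothetical key field
            .field("cnt", DataTypes.BIGINT())) \
        .with_format(
            Json()
            .derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("sink")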
def register_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("performance_source")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT())]))
            .fail_on_missing_field(True)) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("a", DataTypes.INT())) \
        .in_append_mode() \
        .create_temporary_table("source")
def register_ride_duration_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("durationMin", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("durationMin", DataTypes.BIGINT())) \
        .in_append_mode() \
        .register_table_sink("TempResults")
def test_json_schema(self):
    json = Json().json_schema(
        "{"
        "'title': 'Fruit',"
        "'type': 'object',"
        "'properties': "
        "{"
        "'name': {'type': 'string'},"
        "'count': {'type': 'integer'},"
        "'time': "
        "{"
        "'description': 'row time',"
        "'type': 'string',"
        "'format': 'date-time'"
        "}"
        "},"
        "'required': ['name', 'count', 'time']"
        "}")

    expected = {
        'format.json-schema':
            "{"
            "'title': 'Fruit',"
            "'type': 'object',"
            "'properties': {"
            "'name': {'type': 'string'},"
            "'count': {'type': 'integer'},"
            "'time': {"
            "'description': 'row time',"
            "'type': 'string',"
            "'format': 'date-time'}"
            "},"
            "'required': ['name', 'count', 'time']}",
        'format.property-version': '1',
        'format.type': 'json'
    }

    properties = json.to_properties()
    self.assertEqual(expected, properties)
def register_transactions_source(st_env):
    st_env.connect(
        Kafka()
        .version("universal")
        .topic("server-logs")
        .start_from_earliest()
        .property("zookeeper.connect", "localhost:2181")
        .property("bootstrap.servers", "localhost:9092")) \
        .with_format(
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("event_id", DataTypes.STRING()),
                DataTypes.FIELD("account_id", DataTypes.DOUBLE()),
                # event_type and location_country are STRING to match the
                # table schema below (the original declared them DOUBLE,
                # which conflicts with with_schema)
                DataTypes.FIELD("event_type", DataTypes.STRING()),
                DataTypes.FIELD("location_country", DataTypes.STRING()),
                DataTypes.FIELD("event_timestamp", DataTypes.TIMESTAMP(precision=3))]))) \
        .with_schema(
            Schema()
            .field("event_id", DataTypes.STRING())
            .field("account_id", DataTypes.DOUBLE())
            .field("event_type", DataTypes.STRING())
            .field("location_country", DataTypes.STRING())
            .field("event_timestamp", DataTypes.TIMESTAMP(precision=3))) \
        .in_append_mode() \
        .create_temporary_table("source")
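# Hedged wiring sketch: this "server-logs" source paired with the
# "account-activity" Elasticsearch 7 sink (the create_temporary_table variant
# of register_transactions_es_sink above); a plain pass-through, assuming
# PyFlink 1.11 for from_path/execute_insert.
def run_account_activity(st_env):
    register_transactions_source(st_env)
    register_transactions_es_sink(st_env)
    st_env.from_path("source").execute_insert("sink_elasticsearch")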
from pyflink.table.descriptors import Kafka, Json, OldCsv, Schema, FileSystem

directories = ['/flink/lib']
for directory in directories:
    for jar in glob.glob(os.path.join(directory, '*.jar')):
        sys.path.append(jar)

# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer11
# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer09

OldCsv()
print("debug 010")
Kafka()
print("debug 020")
Json()
print("debug 030")

sourcetable = table_env \
    .connect(Kafka()
             .properties({'update-mode': 'append',
                          'connector.topic': 'machine.data',
                          'connector.properties.zookeeper.connect': 'localhost:2181',
                          'connector.properties.bootstrap.servers': 'localhost:9092'})) \
    .with_format(Json()
                 .json_schema(
                     "{type:'object',properties:{thing: {type: 'string'},quantity:{type:'string'},"
                     "phenomenonTime:{type:'integer'},result:{type:'number'}}}")
                 .fail_on_missing_field(False)) \
    .with_schema(Schema()
                 .field("thing", DataTypes.STRING())
                 .field("quantity", DataTypes.STRING())
                 # remaining fields inferred from the json_schema above
                 .field("phenomenonTime", DataTypes.BIGINT())
                 .field("result", DataTypes.DOUBLE()))
        .use_blink_planner().build())
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
            .start_from_specific_offset(0, 496)
            .property("zookeeper.connect", "6.86.2.170:2181")
            .property("bootstrap.servers", "6.86.2.170:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'number'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}")) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
            Kafka()
            .version("0.11")
            .topic("input")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    timestamp: {"
                "      type: 'string'"
                "    },"
                "    page: {"
                "      type: 'string'"
                "    }"
                "  }"
                "}")) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("timestamp", DataTypes.TIMESTAMP()).proctime()
            .field("page", DataTypes.STRING())) \
        .in_append_mode() \
        .register_table_source("ClickEvent Source")
st_env.register_function('predict', predict)

st_env.connect(
    Kafka()
    .version('universal')
    .topic('test')
    .start_from_earliest()
    .property('zookeeper.connect', 'zookeeper:2181')
    .property('bootstrap.servers', 'kafka:9092')
).with_format(
    Json()
    .fail_on_missing_field(True)
    .schema(
        DataTypes.ROW([
            DataTypes.FIELD('datetime', DataTypes.STRING()),
            DataTypes.FIELD('text', DataTypes.STRING()),
        ])
    )
).with_schema(
    Schema()
    .field('datetime', DataTypes.STRING())
    .field('text', DataTypes.STRING())
).in_append_mode().register_table_source(
    'source'
)

result_path = '/notebooks/output-tensorflow.csv'
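# A hedged continuation for the notebook snippet above: register a CSV sink at
# result_path and run the predict UDF over the Kafka source. Assumes predict
# is a scalar function STRING -> STRING; the sink name and output field names
# are illustrative.
st_env.register_table_sink(
    'sink',
    CsvTableSink(['datetime', 'prediction'],
                 [DataTypes.STRING(), DataTypes.STRING()],
                 result_path))
st_env.scan('source') \
    .select("datetime, predict(text) as prediction") \
    .insert_into('sink')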
def distinct_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}")) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())) \
        .in_append_mode() \
        .register_table_source("Orders")
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("distinct_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())) \
        .with_format(
            Json()
            .derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("result")

    orders = st_env.scan("Orders")
    result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \
        .group_by("a, w").select("a, b.max.distinct as d")
    result.insert_into("result")
    st_env.execute("distinct agg streaming")
def tumble_time_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/tumble_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}")) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())) \
        .in_append_mode() \
        .register_table_source("source")
    st_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file))
    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")
    st_env.execute("tumble time window streaming")
def pv_uv_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user_behavior")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    user_id: {"
                "      type: 'string'"
                "    },"
                "    item_id: {"
                "      type: 'string'"
                "    },"
                "    category_id: {"
                "      type: 'string'"
                "    },"
                "    behavior: {"
                "      type: 'string'"
                "    },"
                "    ts: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}")) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("user_id", DataTypes.STRING())
            .field("item_id", DataTypes.STRING())
            .field("category_id", DataTypes.STRING())
            .field("behavior", DataTypes.STRING())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("ts")
                .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \
        .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \
        .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \
        .property("connector.table", "pv_uv_table") \
        .property("connector.write.flush.max-rows", "1")
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("startTime", DataTypes.TIMESTAMP())
            .field("endTime", DataTypes.TIMESTAMP())
            .field("pv", DataTypes.BIGINT())
            .field("uv", DataTypes.BIGINT())) \
        .register_table_sink("sink")
    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("w.start as startTime, w.end as endTime, "
                "COUNT(1) as pv, user_id.count.distinct as uv") \
        .insert_into("sink")
    st_env.execute("table pv uv")
st_env = StreamTableEnvironment.create(s_env, environment_settings=env_settings)
st_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka-base_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-jdbc_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-sql-connector-kafka_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/kafka-clients-2.1.0.jar"
)

# read from Kafka
properties = {
    "zookeeper.connect": "nn1.hadoop:2181,nn2.hadoop:2181,s1.hadoop:2181",
    "bootstrap.servers": "nn1.hadoop:9092,nn2.hadoop:9092,s1.hadoop:9092",
    "group.id": "testGroup"
}
st_env.connect(Kafka().properties(properties).version("universal").topic("test").start_from_latest()) \
    .with_format(Json()) \
    .with_schema(Schema()
                 .field('throughputReqMax', DataTypes.BIGINT())
                 .field('throughputReqTotal', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

# write to CSV
st_env.connect(FileSystem().path('/usr/local/flink/test/result3.txt')) \
    .with_format(OldCsv()
                 .field('sub', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sub', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

# subtract throughputReqMax from throughputReqTotal and insert the result into the sink
st_env.from_path('mySource') \
    .select("(throughputReqTotal - throughputReqMax) as sub") \
    .insert_into('mySink')