def test_with_schema(self):
    """Verify the property map of a filesystem connector with OldCsv format and a schema.

    The schema declares one field, ``a``, with the type string ``INT``.
    """
    with_csv = self.t_env.connect(FileSystem()).with_format(OldCsv())
    actual = with_csv.with_schema(Schema().field("a", "INT")).to_properties()

    assert actual == {
        'schema.0.name': 'a',
        'schema.0.data-type': 'INT',
        'format.type': 'csv',
        'format.property-version': '1',
        'connector.type': 'filesystem',
        'connector.property-version': '1',
    }
def test_in_append_mode(self):
    """Verify that ``in_append_mode()`` adds ``update-mode = append`` to the properties."""
    with_csv = self.t_env.connect(FileSystem()).with_format(OldCsv())
    actual = with_csv.in_append_mode().to_properties()

    assert actual == {
        'update-mode': 'append',
        'format.type': 'csv',
        'format.property-version': '1',
        'connector.property-version': '1',
        'connector.type': 'filesystem',
    }
def word_count():
    """Run a batch word count over a fixed text and write ``word,count`` rows to a CSV sink.

    The sink lives at ``<system temp dir>/result``; anything already at that
    path is removed first (a failure to remove is logged, not raised).
    """
    content = (
        "line Licensed to the Apache Software Foundation ASF under one "
        "line or more contributor license agreements See the NOTICE file "
        "line distributed with this work for additional information "
        "line regarding copyright ownership The ASF licenses this file "
        "to you under the Apache License Version the "
        "License you may not use this file except in compliance "
        "with the License"
    )

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # Clear any previous output so the sink can be written again.
    result_path = tempfile.gettempdir() + '/result'
    if os.path.exists(result_path):
        remove = os.remove if os.path.isfile(result_path) else shutil.rmtree
        try:
            remove(result_path)
        except OSError as err:
            logging.error("Error removing directory: %s - %s.", err.filename, err.strerror)

    logging.info("Results directory: %s", result_path)

    # Register the "Results" sink table in the table environment.
    (
        t_env.connect(FileSystem().path(result_path))
        .with_format(
            OldCsv()
            .field_delimiter(',')
            .field("word", DataTypes.STRING())
            .field("count", DataTypes.BIGINT())
        )
        .with_schema(
            Schema()
            .field("word", DataTypes.STRING())
            .field("count", DataTypes.BIGINT())
        )
        .register_table_sink("Results")
    )

    # One (word, 1) row per space-separated token.
    elements = [(token, 1) for token in content.split(" ")]
    words = t_env.from_elements(elements, ["word", "count"])
    counted = words.group_by("word").select("word, count(1) as count")
    counted.insert_into("Results")

    t_env.execute("word_count")
def test_field(self):
    """Verify that ``OldCsv.field`` records each field's name and data type in order.

    Two fields use ``DataTypes`` objects and the third uses the type string
    ``"SQL_TIMESTAMP"``.
    """
    old_csv = OldCsv()
    declared_fields = (
        ("a", DataTypes.BIGINT()),
        ("b", DataTypes.STRING()),
        ("c", "SQL_TIMESTAMP"),
    )
    for field_name, field_type in declared_fields:
        old_csv.field(field_name, field_type)

    actual = old_csv.to_properties()

    expected = {
        'format.fields.0.name': 'a',
        'format.fields.0.data-type': 'BIGINT',
        'format.fields.1.name': 'b',
        'format.fields.1.data-type': 'VARCHAR(2147483647)',
        'format.fields.2.name': 'c',
        'format.fields.2.data-type': 'TIMESTAMP(3)',
        'format.type': 'csv',
        'format.property-version': '1',
    }
    self.assertEqual(expected, actual)
def test_schema(self):
    """Verify that ``OldCsv.schema`` turns a ``TableSchema`` into format field properties."""
    two_columns = TableSchema(["a", "b"], [DataTypes.INT(), DataTypes.STRING()])
    actual = OldCsv().schema(two_columns).to_properties()

    expected = {
        'format.fields.0.name': 'a',
        'format.fields.0.data-type': 'INT',
        'format.fields.1.name': 'b',
        'format.fields.1.data-type': 'VARCHAR(2147483647)',
        'format.type': 'csv',
        'format.property-version': '1',
    }
    self.assertEqual(expected, actual)
def execute(self, function_context: FlinkFunctionContext, input_table: Table) -> None:
    """Register a tab-delimited CSV sink and queue ``input_table`` for insertion into it.

    The sink path is the ``batch_uri`` of the example meta obtained from
    ``function_context``. Any previous output at that path is removed first.

    :param function_context: supplies the example meta, the table environment
        and the statement set.
    :param input_table: table with ``word`` / ``count`` columns to write.
    """
    # Local import keeps this block self-contained.
    import shutil

    example_meta: af.ExampleMeta = function_context.get_example_meta()
    output_file = example_meta.batch_uri

    # Earlier output may be a single file or a directory. os.remove() alone
    # raises on a directory, so handle both. Symlinks are unlinked rather than
    # followed, which matches what os.remove() did before.
    if os.path.exists(output_file):
        if os.path.isdir(output_file) and not os.path.islink(output_file):
            shutil.rmtree(output_file)
        else:
            os.remove(output_file)

    t_env = function_context.get_table_env()
    statement_set = function_context.get_statement_set()

    t_env.connect(FileSystem().path(output_file)) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    statement_set.add_insert('mySink', input_table)
def process(self, execution_context: flink.ExecutionContext, input_list: List[Table] = None) -> List[Table]:
    """Register a tab-delimited CSV sink at ``<cwd>/output`` and queue the first input table.

    Any previous output at the sink path is removed first.

    :param execution_context: supplies ``table_env`` and ``statement_set``.
    :param input_list: input tables; only ``input_list[0]`` is used.
        NOTE(review): the default is ``None`` but the value is indexed
        unconditionally, so callers must pass a non-empty list.
    :return: always an empty list.
    """
    # Local import keeps this block self-contained.
    import shutil

    output_file = os.path.join(os.getcwd(), 'output')

    # Earlier output may be a single file or a directory. os.remove() alone
    # raises on a directory, so handle both. Symlinks are unlinked rather than
    # followed, which matches what os.remove() did before.
    if os.path.exists(output_file):
        if os.path.isdir(output_file) and not os.path.islink(output_file):
            shutil.rmtree(output_file)
        else:
            os.remove(output_file)

    t_env = execution_context.table_env
    statement_set = execution_context.statement_set

    t_env.connect(FileSystem().path(output_file)) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    statement_set.add_insert('mySink', input_list[0])
    return []
def word_count():
    """Run a batch word count over five local text files and write ``word,count`` rows to CSV.

    The five input files are read in order and concatenated, then split on
    single spaces. The sink lives at ``<system temp dir>/result``; anything
    already at that path is removed first (a failure to remove is logged, not
    raised).

    :raises OSError: if any input file cannot be opened.
    """
    input_paths = (
        "/home/mnm/flink-1.9.1/1",
        "/home/mnm/flink-1.9.1/2",
        "/home/mnm/flink-1.9.1/3",
        "/home/mnm/flink-1.9.1/4",
        "/home/mnm/flink-1.9.1/5",
    )
    pieces = []
    for path in input_paths:
        # The context manager closes each handle as soon as it has been read;
        # previously all five handles were left open.
        with open(path, "r") as handle:
            pieces.append(handle.read())
    content = "".join(pieces)

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # Clear any previous output so the sink can be written again.
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    # Register the "Results" sink table in the table environment.
    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    # One (word, 1) row per space-separated token.
    elements = [(word, 1) for word in content.split(" ")]

    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute("Python batch word count")
class main():
    """Batch word-count job whose statements run when this class body is executed.

    The source table ``mySource`` is declared with DDL, the sink ``mySink`` is
    a tab-delimited CSV file at ``/tmp/output``, and the job is submitted as
    ``tutorial_job``.
    """

    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # The source is registered through SQL DDL rather than the connect() descriptor API.
    my_source_ddl = """ create table mySource ( word VARCHAR ) with ( 'connector.type' = 'filesystem', 'format.type' = 'csv', 'connector.path' = './temp/input' ) """
    t_env.sql_update(my_source_ddl)

    # Sink: one "word<TAB>count" row per distinct word.
    (
        t_env.connect(FileSystem().path('/tmp/output'))
        .with_format(
            OldCsv()
            .field_delimiter('\t')
            .field('word', DataTypes.STRING())
            .field('count', DataTypes.BIGINT())
        )
        .with_schema(
            Schema()
            .field('word', DataTypes.STRING())
            .field('count', DataTypes.BIGINT())
        )
        .create_temporary_table('mySink')
    )

    (
        t_env.from_path('mySource')
        .group_by('word')
        .select('word, count(1)')
        .insert_into('mySink')
    )

    t_env.execute("tutorial_job")
def execute(self, function_context: FlinkFunctionContext, input_list: List[Table]) -> List[Table]:
    """Apply the ``build_index`` UDF to the first input table and queue the result for a CSV sink.

    The UDF is built from ``self.path``, ``self.element_type`` and
    ``self.dimension``. The sink ``train_sink`` is a one-column CSV at
    ``/tmp/indexed_key``; anything already at that path is removed first.

    :param function_context: supplies the table environment and statement set.
    :param input_list: input tables; only the first one is used.
    :return: always an empty list.
    """
    table_env = function_context.get_table_env()
    inserts = function_context.get_statement_set()
    source_table = input_list[0]

    index_builder = udf(
        BuildIndexUDF(self.path, self.element_type, self.dimension),
        [DataTypes.STRING(), DataTypes.STRING()],
        DataTypes.STRING(),
    )
    table_env.register_function("build_index", index_builder)

    # Clear any previous output, whether it is a directory or a single file.
    dummy_output_path = '/tmp/indexed_key'
    if os.path.isdir(dummy_output_path):
        shutil.rmtree(dummy_output_path)
    elif os.path.exists(dummy_output_path):
        os.remove(dummy_output_path)

    (
        table_env.connect(FileSystem().path(dummy_output_path))
        .with_format(OldCsv().field('key', DataTypes.STRING()))
        .with_schema(Schema().field('key', DataTypes.STRING()))
        .create_temporary_table('train_sink')
    )

    indexed = source_table.select("build_index(uuid, feature_data)")
    inserts.add_insert("train_sink", indexed)
    return []
) ).with_schema( Schema() .field('datetime', DataTypes.STRING()) .field('text', DataTypes.STRING()) ).in_append_mode().register_table_source( 'source' ) result_path = '/notebooks/output-tensorflow.csv' t_env.connect(FileSystem().path(result_path)).with_format( OldCsv() .field_delimiter(',') .field('datetime', DataTypes.STRING()) .field('sentence', DataTypes.STRING()) .field('label', DataTypes.STRING()) ).with_schema( Schema() .field('datetime', DataTypes.STRING()) .field('sentence', DataTypes.STRING()) .field('label', DataTypes.STRING()) ).in_append_mode().register_table_sink( 'sink' ) st_env.from_path('source').select( 'datetime, sentence, predict(sentence)' ).insert_into('sink')
table_config.set_null_check(False) # Create table environment where the source and sink tables will be registered table_env = StreamTableEnvironment.create(env, table_config) from pyflink.table.descriptors import Kafka, Json, OldCsv, Schema, FileSystem directories = ['/flink/lib'] for directory in directories: for jar in glob.glob(os.path.join(directory, '*.jar')): sys.path.append(jar) # from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer11 # from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer09 OldCsv() print("debug 010") Kafka() print("debug 020") Json() print("debug 030") sourcetable = table_env \ .connect(Kafka() .properties({'update-mode': 'append', 'connector.topic': 'machine.data', 'connector.properties.zookeeper.connect': 'localhost:2181', 'connector.properties.bootstrap.servers.': 'localhost:9092'})) \ .with_format(Json(). json_schema(
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

# Batch word count: read words from 'input', write "word,count" rows to 'output'.
exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(2)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

# Source: a single 'word' column, with a space configured as the line delimiter.
(
    t_env.connect(FileSystem().path('input'))
    .with_format(
        OldCsv()
        .line_delimiter(' ')
        .field('word', DataTypes.STRING())
    )
    .with_schema(Schema().field('word', DataTypes.STRING()))
    .register_table_source("inputSource")
)

# Sink: comma-delimited (word, count) rows.
(
    t_env.connect(FileSystem().path('output'))
    .with_format(
        OldCsv()
        .field_delimiter(',')
        .field('word', DataTypes.STRING())
        .field('count', DataTypes.BIGINT())
    )
    .with_schema(
        Schema()
        .field('word', DataTypes.STRING())
        .field('count', DataTypes.BIGINT())
    )
    .register_table_sink('sink')
)

(
    t_env.scan('inputSource')
    .group_by('word')
    .select('word, count(1)')
    .insert_into('sink')
)

t_env.execute('my first job')
def word_count():
    """Aggregate sales per city from in-memory comma-separated rows and write them to a CSV sink.

    Each input row is one string holding four comma-separated values. Two
    Python UDFs (``split`` and ``get``) pull out the columns; element 3 is used
    as the city, element 1 as the count and element 2 as the unit price. The
    sink lives at ``<system temp dir>/result``; anything already at that path
    is removed first (a failure to remove is logged, not raised).
    """
    settings = (
        EnvironmentSettings.new_instance()
        .in_batch_mode()
        .use_blink_planner()
        .build()
    )
    t_env = BatchTableEnvironment.create(environment_settings=settings)

    # Clear any previous output so the sink can be written again.
    result_path = tempfile.gettempdir() + '/result'
    if os.path.exists(result_path):
        remove = os.remove if os.path.isfile(result_path) else shutil.rmtree
        try:
            remove(result_path)
        except OSError as err:
            logging.error("Error removing directory: %s - %s.", err.filename, err.strerror)

    logging.info("Results directory: %s", result_path)

    # Pin the Python executable explicitly to "python3".
    t_env.get_config().set_python_executable("python3")

    # Register the "Results" sink table in the table environment.
    (
        t_env.connect(FileSystem().path(result_path))
        .with_format(
            OldCsv()
            .field_delimiter(',')
            .field("city", DataTypes.STRING())
            .field("sales_volume", DataTypes.BIGINT())
            .field("sales", DataTypes.BIGINT())
        )
        .with_schema(
            Schema()
            .field("city", DataTypes.STRING())
            .field("sales_volume", DataTypes.BIGINT())
            .field("sales", DataTypes.BIGINT())
        )
        .register_table_sink("Results")
    )

    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        """Split a comma-separated string into its parts."""
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()), DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        """Return the element of ``arr`` at position ``index``."""
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)
    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    data = [
        ("iPhone 11,30,5499,Beijing", ),
        ("iPhone 11 Pro,20,8699,Guangzhou", ),
        ("MacBook Pro,10,9999,Beijing", ),
        ("AirPods Pro,50,1999,Beijing", ),
        ("MacBook Pro,10,11499,Shanghai", ),
        ("iPhone 11,30,5999,Shanghai", ),
        ("iPhone 11 Pro,20,9999,Shenzhen", ),
        ("MacBook Pro,10,13899,Hangzhou", ),
        ("iPhone 11,10,6799,Beijing", ),
        ("MacBook Pro,10,18999,Beijing", ),
        ("iPhone 11 Pro,10,11799,Shenzhen", ),
        ("MacBook Pro,10,22199,Shanghai", ),
        ("AirPods Pro,40,1999,Shanghai", ),
    ]

    raw_lines = t_env.from_elements(data, ["line"])
    parsed = raw_lines.select("split(line) as str_array").select(
        "get(str_array, 3) as city, "
        "get(str_array, 1).cast(LONG) as count, "
        "get(str_array, 2).cast(LONG) as unit_price"
    )
    priced = parsed.select("city, count, count * unit_price as total_price")
    per_city = priced.group_by("city").select(
        "city, "
        "sum(count) as sales_volume, "
        "sum(total_price) as sales"
    )
    per_city.insert_into("Results")

    t_env.execute("word_count")
# Jars made available to the job through the "pipeline.jars" option.
st_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka-base_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-jdbc_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-sql-connector-kafka_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/kafka-clients-2.1.0.jar",
)

# Read from Kafka.
properties = {
    "zookeeper.connect": "nn1.hadoop:2181,nn2.hadoop:2181,s1.hadoop:2181",
    "bootstrap.servers": "nn1.hadoop:9092,nn2.hadoop:9092,s1.hadoop:9092",
    "group.id": "testGroup",
}
(
    st_env.connect(
        Kafka()
        .properties(properties)
        .version("universal")
        .topic("test")
        .start_from_latest()
    )
    .with_format(Json())
    .with_schema(
        Schema()
        .field('throughputReqMax', DataTypes.BIGINT())
        .field('throughputReqTotal', DataTypes.BIGINT())
    )
    .create_temporary_table('mySource')
)

# Write to CSV.
(
    st_env.connect(FileSystem().path('/usr/local/flink/test/result3.txt'))
    .with_format(OldCsv().field('sub', DataTypes.BIGINT()))
    .with_schema(Schema().field('sub', DataTypes.BIGINT()))
    .create_temporary_table('mySink')
)

# Subtract throughputReqMax from throughputReqTotal for each Kafka record
# and insert the difference into the sink.
(
    st_env.from_path('mySource')
    .select("(throughputReqTotal-throughputReqMax)")
    .insert_into('mySink')
)

st_env.execute("job_test")
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

# Source: Kafka topic "test", CSV format, one INT column 'tbd', append mode.
(
    t_env.connect(
        Kafka()
        .version("0.11")
        .topic("test")
        .property("zookeeper.connect", "localhost:2181")
        .property("bootstrap.servers", "localhost:9092")
    )
    .in_append_mode()
    .with_format(
        Csv()
        .line_delimiter("\r\n")
        .derive_schema()
    )
    .with_schema(Schema().field("tbd", DataTypes.INT()))
    .register_table_source('mySource')
)

# Sink: CSV file with the same single column.
(
    t_env.connect(FileSystem().path('../production_data/kafkaoutput'))
    .with_format(OldCsv().field('tbd', DataTypes.INT()))
    .with_schema(Schema().field("tbd", DataTypes.INT()))
    .register_table_sink('mySink')
)

# Forward only the rows whose 'tbd' equals 1.
(
    t_env.scan('mySource')
    .select('tbd')
    .where("tbd = 1")
    .insert_into('mySink')
)

t_env.execute("tutorial_job")
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)


@udf(input_types=[DataTypes.DECIMAL(38, 12, nullable=True)],
     result_type=DataTypes.DECIMAL(38, 12, nullable=True))
def myadd(i):
    """Return twice the square of ``i``."""
    return i * i * 2


# The UDF is exposed to the Table API under the name "add".
t_env.register_function("add", myadd)

# Source: one nullable DECIMAL(38, 12) column 'b'.
(
    t_env.connect(FileSystem().path('/tmp/input'))
    .with_format(OldCsv().field('b', DataTypes.DECIMAL(38, 12, nullable=True)))
    .with_schema(Schema().field('b', DataTypes.DECIMAL(38, 12, nullable=True)))
    .create_temporary_table('mySource')
)

# Sink: one nullable DECIMAL(38, 12) column 'sum'.
(
    t_env.connect(FileSystem().path('/tmp/output'))
    .with_format(OldCsv().field('sum', DataTypes.DECIMAL(38, 12, nullable=True)))
    .with_schema(Schema().field('sum', DataTypes.DECIMAL(38, 12, nullable=True)))
    .create_temporary_table('mySink')
)

(
    t_env.from_path('mySource')
    .select("add(b)")
    .insert_into('mySink')
)
# 路径可以是绝对路径或相对路径,但注意路径前面不需要 file:// t_env.set_python_requirements(dir_requirements, dir_cache) else: # 方式 2:指定描述依赖的依赖文件 requirements.txt,作业运行时下载,不推荐。 t_env.set_python_requirements(dir_requirements) # ########################### 创建源表(source) ########################### dir_log = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'syslog.text') # 基于 Table API 创建 source 表 # source 表只有 1 列,名为 line t_env.connect(FileSystem().path(dir_log)) \ .with_format(OldCsv() .line_delimiter('\n') .field('line', DataTypes.STRING())) \ .with_schema(Schema() .field('line', DataTypes.STRING())) \ .create_temporary_table('source') # ########################### 创建结果表(sink) ########################### dir_result = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'result.csv') # 如果文件/文件夹存在,则删除 if os.path.exists(dir_result): if os.path.isfile(dir_result): os.remove(dir_result) else:
t_env = StreamTableEnvironment.create(env)

# Expose the four UDFs to the Table API under their own names.
t_env.register_function("add1", add1)
t_env.register_function("add2", add2)
t_env.register_function("add3", add3)
t_env.register_function("add4", add4)

t = t_env.from_elements(
    [(1, 2, 'Welcome'), (2, 3, 'To'), (3, 4, 'PyFlink')],
    ['a', 'b', 'c'],
)

# Sink: four UDF results followed by the original 'b' and 'c' columns.
(
    t_env.connect(FileSystem().path(sink_path))
    .with_format(
        OldCsv()
        .field_delimiter(',')
        .field("add1", DataTypes.BIGINT())
        .field("add2", DataTypes.BIGINT())
        .field("add3", DataTypes.BIGINT())
        .field("add4", DataTypes.BIGINT())
        .field("b", DataTypes.BIGINT())
        .field("c", DataTypes.STRING())
    )
    .with_schema(
        Schema()
        .field("add1", DataTypes.BIGINT())
        .field("add2", DataTypes.BIGINT())
        .field("add3", DataTypes.BIGINT())
        .field("add4", DataTypes.BIGINT())
        .field("b", DataTypes.BIGINT())
        .field("c", DataTypes.STRING())
    )
    .register_table_sink("pyflink_sink")
)

(
    t.select("add1(a, b), add2(a, b), add3(a, b), add4(a, b), b, c")
    .insert_into("pyflink_sink")
)
import pyflink as fk from pyflink.dataset import ExecutionEnvironment from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment from pyflink.table.descriptors import Schema, OldCsv, FileSystem exec_env = ExecutionEnvironment.get_execution_environment() exec_env.set_parallelism(1) t_config = TableConfig() t_env = BatchTableEnvironment.create(exec_env, t_config) # 连接器输入 t_env.connect(FileSystem().path("./data/in")).with_format(OldCsv().field("word",DataTypes.STRING()))\ .with_schema(Schema().field('word',DataTypes.STRING()))\ .create_temporary_table('mySource') t_env.connect(FileSystem().path("./data/out"))\ .with_format(OldCsv().field_delimiter('\t').field('word',DataTypes.STRING()).field('count',DataTypes.BIGINT()))\ .with_schema(Schema().field('word',DataTypes.STRING()).field('count',DataTypes.BIGINT()))\ .create_temporary_table("mySink") my_source_ddl = """ create table mySource ( word VARCHAR ) with ( 'connector.type' = 'filesystem', 'format.type' = 'csv', 'connector.path' = './data/in/in.csv' ) """ my_sink_ddl = """ create table mySink ( word VARCHAR, `count` BIGINT
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.udf import udf

env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)

# Two-argument BIGINT addition, exposed to the Table API as "add".
add = udf(lambda i, j: i + j, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())
t_env.register_function("add", add)

# Source: two BIGINT columns 'a' and 'b'.
(
    t_env.connect(FileSystem().path('/opt/examples/data/udf_add_input'))
    .with_format(
        OldCsv()
        .field('a', DataTypes.BIGINT())
        .field('b', DataTypes.BIGINT())
    )
    .with_schema(
        Schema()
        .field('a', DataTypes.BIGINT())
        .field('b', DataTypes.BIGINT())
    )
    .create_temporary_table('mySource')
)

# Sink: one BIGINT column 'sum'.
(
    t_env.connect(FileSystem().path('/opt/examples/data/udf_add_output'))
    .with_format(OldCsv().field('sum', DataTypes.BIGINT()))
    .with_schema(Schema().field('sum', DataTypes.BIGINT()))
    .create_temporary_table('mySink')
)

(
    t_env.from_path('mySource')
    .select("add(a, b)")
    .insert_into('mySink')
)
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig, WriteMode
from pyflink.table.descriptors import FileSystem, OldCsv, Schema
from pyflink.table.types import DataTypes

# Batch word count: one word per line in, "word,count" rows out.
t_config = TableConfig()
env = ExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = BatchTableEnvironment.create(env, t_config)

source_file = '/notebooks/big-text.txt'
sink_file = '/notebooks/sink.csv'

# Source: newline-delimited rows with a single 'word' column.
(
    t_env.connect(FileSystem().path(source_file))
    .with_format(
        OldCsv()
        .line_delimiter('\n')
        .field('word', DataTypes.STRING())
    )
    .with_schema(Schema().field('word', DataTypes.STRING()))
    .register_table_source('mySource')
)

# Sink: comma-delimited (word, count) rows.
(
    t_env.connect(FileSystem().path(sink_file))
    .with_format(
        OldCsv()
        .field_delimiter(',')
        .field('word', DataTypes.STRING())
        .field('count', DataTypes.BIGINT())
    )
    .with_schema(
        Schema()
        .field('word', DataTypes.STRING())
        .field('count', DataTypes.BIGINT())
    )
    .register_table_sink('mySink')
)

(
    t_env.scan('mySource')
    .group_by('word')
    .select('word, count(1)')
    .insert_into('mySink')
)

t_env.execute('wordcount')
exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

# Source: Kafka topic "test", CSV format, one STRING column 'tbd', append mode.
(
    t_env.connect(
        Kafka()
        .version("0.11")
        .topic("test")
        .property("zookeeper.connect", "localhost:2181")
        .property("bootstrap.servers", "localhost:9092")
    )
    .in_append_mode()
    .with_format(
        Csv()
        .line_delimiter("\r\n")
        .derive_schema()
    )
    .with_schema(Schema().field("tbd", DataTypes.STRING()))
    .register_table_source('mySource')
)

# Sink: CSV file with the same single column.
(
    t_env.connect(FileSystem().path('../production_data/kafkaoutput'))
    .with_format(OldCsv().field('tbd', DataTypes.STRING()))
    .with_schema(Schema().field("tbd", DataTypes.STRING()))
    .register_table_sink('mySink')
)

# Every source row is forwarded to the sink unchanged.
t_env.scan('mySource').insert_into('mySink')

t_env.execute("tutorial_job")
exec_env = ExecutionEnvironment.get_execution_environment() exec_env.set_parallelism(1) t_config = TableConfig() t_env = BatchTableEnvironment.create(exec_env, t_config) input_path = './input/data.json' output_path = './output/output.txt' if os.path.exists(output_path): os.remove(output_path) wpre.wcpretreatment(input_path) input_path = './input' t_env.connect(FileSystem().path(input_path)) \ .with_format(OldCsv() .field('dateRep', DataTypes.DATE(True))) \ .with_schema(Schema() .field('dateRep', DataTypes.DATE(True))) \ .create_temporary_table('mySource') t_env.connect(FileSystem().path(output_path)) \ .with_format(OldCsv() .field_delimiter('\t') .field('dateRep', DataTypes.DATE(True)) .field('deaths_weekly', DataTypes.BIGINT())) \ .with_schema(Schema() .field('dateRep', DataTypes.DATE(True)) .field('deaths_weekly', DataTypes.BIGINT())) \ .create_temporary_table('mySink') t_env.from_path('mySource') \
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(
    env,
    environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build(),
)

result_path = '/notebooks/output.csv'
print('Results directory:', result_path)

# Sink: comma-delimited (word, count) rows registered as "Results".
(
    t_env.connect(FileSystem().path(result_path))
    .with_format(
        OldCsv()
        .field_delimiter(',')
        .field('word', DataTypes.STRING())
        .field('count', DataTypes.BIGINT())
    )
    .with_schema(
        Schema()
        .field('word', DataTypes.STRING())
        .field('count', DataTypes.BIGINT())
    )
    .register_table_sink('Results')
)

# One (word, 1) row per space-separated token of `content`.
elements = [(word, 1) for word in content.split(' ')]

(
    t_env.from_elements(elements, ['word', 'count'])
    .group_by('word')
    .select('word, count(1) as count')
    .insert_into('Results')
)
sentences = [string] x = np.zeros((len(sentences), maxlen)) for i, sentence in enumerate(sentences): for no, k in enumerate(sentence.split()[:maxlen][::-1]): x[i, -1 - no] = dic.get(k, UNK) indices = np.argmax(sess.run(Y, feed_dict={X: x}), axis=1) return label[indices[0]] t_env.set_python_requirements('/notebooks/requirements.txt') t_env.register_function('predict', predict) result_path = '/notebooks/output-tensorflow.csv' t_env.connect(FileSystem().path(result_path)).with_format( OldCsv().field_delimiter(',').field('sentence', DataTypes.STRING()).field( 'label', DataTypes.STRING())).with_schema(Schema().field( 'sentence', DataTypes.STRING()).field( 'label', DataTypes.STRING())).create_temporary_table('mySink') elements = [(sentence, ) for sentence in content.split('\n')] t_env.from_elements( elements, ['sentence']).select('sentence, predict(sentence)').insert_into('mySink') t_env.execute('predict')