def test_rowtime(self):
    schema = Schema()\
        .field("int_field", DataTypes.INT())\
        .field("long_field", DataTypes.BIGINT())\
        .field("rtime", DataTypes.BIGINT())\
        .rowtime(
            Rowtime().timestamps_from_field("long_field").watermarks_periodic_bounded(5000))\
        .field("string_field", DataTypes.STRING())

    properties = schema.to_properties()
    print(properties)
    expected = {
        'schema.0.name': 'int_field',
        'schema.0.data-type': 'INT',
        'schema.1.name': 'long_field',
        'schema.1.data-type': 'BIGINT',
        'schema.2.name': 'rtime',
        'schema.2.data-type': 'BIGINT',
        'schema.2.rowtime.timestamps.type': 'from-field',
        'schema.2.rowtime.timestamps.from': 'long_field',
        'schema.2.rowtime.watermarks.type': 'periodic-bounded',
        'schema.2.rowtime.watermarks.delay': '5000',
        'schema.3.name': 'string_field',
        'schema.3.data-type': 'VARCHAR(2147483647)'
    }
    self.assertEqual(expected, properties)
def main_flink():
    # The preceding steps read a file, preprocess it, and write the result to the 'input' file.
    env = StreamExecutionEnvironment.get_execution_environment()
    parr_num = 4
    env.set_parallelism(parr_num)
    t_env = StreamTableEnvironment.create(env)

    @udf(input_types=DataTypes.STRING(), result_type=DataTypes.STRING())
    def cut_extract(string):
        return cut_posseg.cut_extract(string)

    t_env.register_function("cut_extract", cut_extract)
    # Here a table is created that reads from 'input'.
    # Question 1: is there a way to use a custom list as the input to save the IO overhead,
    # e.g. passing a list like [text_a, text_b] directly? (See the sketch after this function.)
    t_env.connect(FileSystem().path('/home/sjtu/input')) \
        .with_format(OldCsv()
                     .field('text', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('text', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    t_env.connect(FileSystem().path('/home/sjtu/output')) \
        .with_format(OldCsv()
                     .field('result', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('result', DataTypes.STRING())) \
        .create_temporary_table('mySink')

    t_env.from_path('mySource')\
        .select("cut_extract(text)")\
        .insert_into('mySink')
    # Question 2: here the result table is written to a file, but the processed data still
    # needs further processing in this code. Is there a way to take the mySink table above
    # directly as in-memory data instead of reading it back from disk?
    t_env.execute("tutorial_job")
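# A possible answer to both questions above, as a minimal sketch: from_elements builds a
# table from an in-memory list, and to_pandas pulls the result back without a filesystem
# round trip. This assumes PyFlink >= 1.11 (where Table.to_pandas exists) and that the
# cut_extract UDF has already been registered on t_env; the names main_flink_in_memory,
# source_table, and result_df are illustrative, not from the original code.
def main_flink_in_memory(t_env):
    # Build the source table directly from a Python list instead of a CSV file.
    source_table = t_env.from_elements([('text a',), ('text b',)], ['text'])
    result_table = source_table.select("cut_extract(text)")
    # Materialize the result in memory as a pandas DataFrame for further processing.
    result_df = result_table.to_pandas()
    return result_df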
def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)  # StreamExecutionEnvironment

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    # Raises an error if the output file already exists.
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
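# Since the OldCsv sink above fails when the output file already exists, a common
# workaround (a sketch, using the same output path as above and the same cleanup
# pattern that word_count() below uses) is to remove the file before running the job:
import os

output_path = r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output'
if os.path.exists(output_path):
    os.remove(output_path)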
def test_temporary_tables(self):
    t_env = self.t_env
    t_env.connect(FileSystem().path(os.path.join(self.tempdir, 'temp_1.csv'))) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())) \
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())) \
        .create_temporary_table("temporary_table_1")

    t_env.connect(FileSystem().path(os.path.join(self.tempdir, 'temp_2.csv'))) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())) \
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())) \
        .create_temporary_table("temporary_table_2")

    actual = t_env.list_temporary_tables()
    expected = ['temporary_table_1', 'temporary_table_2']
    self.assert_equals(actual, expected)

    t_env.drop_temporary_table("temporary_table_1")
    actual = t_env.list_temporary_tables()
    expected = ['temporary_table_2']
    self.assert_equals(actual, expected)
def test_fields(self):
    fields = collections.OrderedDict([
        ("int_field", DataTypes.INT()),
        ("long_field", DataTypes.BIGINT()),
        ("string_field", DataTypes.STRING()),
        ("timestamp_field", DataTypes.TIMESTAMP(3)),
        ("time_field", DataTypes.TIME()),
        ("date_field", DataTypes.DATE()),
        ("double_field", DataTypes.DOUBLE()),
        ("float_field", DataTypes.FLOAT()),
        ("byte_field", DataTypes.TINYINT()),
        ("short_field", DataTypes.SMALLINT()),
        ("boolean_field", DataTypes.BOOLEAN())
    ])

    schema = Schema().fields(fields)

    properties = schema.to_properties()
    expected = {
        'schema.0.name': 'int_field',
        'schema.0.data-type': 'INT',
        'schema.1.name': 'long_field',
        'schema.1.data-type': 'BIGINT',
        'schema.2.name': 'string_field',
        'schema.2.data-type': 'VARCHAR(2147483647)',
        'schema.3.name': 'timestamp_field',
        'schema.3.data-type': 'TIMESTAMP(3)',
        'schema.4.name': 'time_field',
        'schema.4.data-type': 'TIME(0)',
        'schema.5.name': 'date_field',
        'schema.5.data-type': 'DATE',
        'schema.6.name': 'double_field',
        'schema.6.data-type': 'DOUBLE',
        'schema.7.name': 'float_field',
        'schema.7.data-type': 'FLOAT',
        'schema.8.name': 'byte_field',
        'schema.8.data-type': 'TINYINT',
        'schema.9.name': 'short_field',
        'schema.9.data-type': 'SMALLINT',
        'schema.10.name': 'boolean_field',
        'schema.10.data-type': 'BOOLEAN'
    }
    self.assertEqual(expected, properties)

    if sys.version_info[:2] <= (3, 5):
        fields = {
            "int_field": DataTypes.INT(),
            "long_field": DataTypes.BIGINT(),
            "string_field": DataTypes.STRING(),
            "timestamp_field": DataTypes.TIMESTAMP(3),
            "time_field": DataTypes.TIME(),
            "date_field": DataTypes.DATE(),
            "double_field": DataTypes.DOUBLE(),
            "float_field": DataTypes.FLOAT(),
            "byte_field": DataTypes.TINYINT(),
            "short_field": DataTypes.SMALLINT(),
            "boolean_field": DataTypes.BOOLEAN()
        }
        self.assertRaises(TypeError, Schema().fields, fields)
def test_schema(self):
    table_schema = TableSchema(["a", "b"], [DataTypes.INT(), DataTypes.STRING()])

    schema = Schema().schema(table_schema)

    properties = schema.to_properties()
    expected = {'schema.0.name': 'a',
                'schema.0.data-type': 'INT',
                'schema.1.name': 'b',
                'schema.1.data-type': 'VARCHAR(2147483647)'}
    self.assertEqual(expected, properties)
def test_proctime(self):
    schema = Schema()\
        .field("int_field", DataTypes.INT())\
        .field("ptime", DataTypes.BIGINT()).proctime()\
        .field("string_field", DataTypes.STRING())

    properties = schema.to_properties()
    expected = {'schema.0.name': 'int_field',
                'schema.0.data-type': 'INT',
                'schema.1.name': 'ptime',
                'schema.1.data-type': 'BIGINT',
                'schema.1.proctime': 'true',
                'schema.2.name': 'string_field',
                'schema.2.data-type': 'VARCHAR(2147483647)'}
    self.assertEqual(expected, properties)
def test_from_origin_field(self):
    schema = Schema()\
        .field("int_field", DataTypes.INT())\
        .field("long_field", DataTypes.BIGINT()).from_origin_field("origin_field_a")\
        .field("string_field", DataTypes.STRING())

    properties = schema.to_properties()
    expected = {'schema.0.name': 'int_field',
                'schema.0.data-type': 'INT',
                'schema.1.name': 'long_field',
                'schema.1.data-type': 'BIGINT',
                'schema.1.from': 'origin_field_a',
                'schema.2.name': 'string_field',
                'schema.2.data-type': 'VARCHAR(2147483647)'}
    self.assertEqual(expected, properties)
def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("universal")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("eventTime", DataTypes.STRING()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("eventTime", DataTypes.STRING())) \
        .in_append_mode() \
        .create_temporary_table("source")
def _local_execute_func(exec_func, write_func, pickle_func, python_path):
    table_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .use_blink_planner().in_batch_mode().build())
    table_env.get_config().get_configuration().set_string(
        'parallelism.default', '1')
    table_env.get_config().set_python_executable(python_path)
    table_env.register_function(
        exec_func,
        udf(lambda _: pickle_func, DataTypes.BIGINT(), DataTypes.STRING()))
    table_env.connect(FileSystem().path(write_func)) \
        .with_format(OldCsv().field('func', DataTypes.STRING())) \
        .with_schema(Schema().field('func', DataTypes.STRING())) \
        .create_temporary_table(exec_func)
    table = table_env.from_elements([(1, 'Joblib')])
    table.select('{}(_1)'.format(exec_func)).insert_into(exec_func)
    table_env.execute(exec_func)
    # decode execution result from table sink file.
    execute_result = cloudpickle.loads(
        codecs.decode(
            pd.DataFrame(pd.read_csv(write_func))[0:].columns[0].encode(),
            'base64'))
    # remove table sink file to clear ineffective files.
    os.remove(write_func)
    return execute_result
def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("eventTime")
                .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
def register_transactions_source(st_env):
    st_env.connect(Kafka()
                   .version("universal")
                   .topic("transactions-data")
                   .start_from_latest()
                   .property("zookeeper.connect", "host.docker.internal:2181")
                   .property("bootstrap.servers", "host.docker.internal:19091")) \
        .with_format(Json()
                     .fail_on_missing_field(True)
                     .schema(DataTypes.ROW([
                         DataTypes.FIELD("customer", DataTypes.STRING()),
                         DataTypes.FIELD("transaction_type", DataTypes.STRING()),
                         DataTypes.FIELD("online_payment_amount", DataTypes.DOUBLE()),
                         DataTypes.FIELD("in_store_payment_amount", DataTypes.DOUBLE()),
                         DataTypes.FIELD("lat", DataTypes.DOUBLE()),
                         DataTypes.FIELD("lon", DataTypes.DOUBLE()),
                         DataTypes.FIELD("transaction_datetime", DataTypes.TIMESTAMP())]))) \
        .with_schema(Schema()
                     .field("customer", DataTypes.STRING())
                     .field("transaction_type", DataTypes.STRING())
                     .field("online_payment_amount", DataTypes.DOUBLE())
                     .field("in_store_payment_amount", DataTypes.DOUBLE())
                     .field("lat", DataTypes.DOUBLE())
                     .field("lon", DataTypes.DOUBLE())
                     .field("rowtime", DataTypes.TIMESTAMP())
                     .rowtime(
                         Rowtime()
                         .timestamps_from_field("transaction_datetime")
                         .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
def execute(self, function_context: FlinkFunctionContext,
            input_list: List[Table]) -> List[Table]:
    t_env = function_context.get_table_env()
    statement_set = function_context.get_statement_set()
    table = input_list[0]
    Popen('rm -rf /root/debug', shell=True)
    t_env.register_function(
        "build_index",
        udf(BuildIndexUDF(self.path, self.element_type, self.dimension),
            [DataTypes.STRING(), DataTypes.STRING()], DataTypes.STRING()))
    dummy_output_path = '/tmp/indexed_key'
    if os.path.exists(dummy_output_path):
        if os.path.isdir(dummy_output_path):
            shutil.rmtree(dummy_output_path)
        else:
            os.remove(dummy_output_path)
    t_env.connect(FileSystem().path(dummy_output_path)) \
        .with_format(OldCsv()
                     .field('key', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('key', DataTypes.STRING())) \
        .create_temporary_table('train_sink')
    statement_set.add_insert(
        "train_sink", table.select("build_index(uuid, feature_data)"))
    return []
def register_sink(st_env, index_name):
    st_env \
        .connect(
            Elasticsearch()
            .version("7")
            .host("localhost", 9200, "http")
            .index(index_name)
            .document_type('pyflink')
            .key_delimiter("_")
            .key_null_literal("null")
            .failure_handler_ignore()
            .disable_flush_on_checkpoint()
            .bulk_flush_max_actions(42)
            .bulk_flush_max_size("42 mb")
            .bulk_flush_interval(3000)
            .bulk_flush_backoff_constant()
            .bulk_flush_backoff_max_retries(3)
            .bulk_flush_backoff_delay(3000)
            .connection_max_retry_timeout(3)) \
        .with_schema(
            Schema()
            .field("a", DataTypes.INT())) \
        .with_format(
            Json()
            .schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT())]))) \
        .in_upsert_mode() \
        .create_temporary_table("sink")
def register_rides_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("rideTime", DataTypes.TIMESTAMP())
            ]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())) \
        .in_append_mode() \
        .register_table_sink("sink")
def test_table_from_descriptor(self):
    from pyflink.table.schema import Schema

    schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
    descriptor = TableDescriptor.for_connector("fake").schema(schema).build()

    table = self.t_env.from_descriptor(descriptor)
    self.assertEqual(
        schema,
        Schema(Schema.new_builder()._j_builder.fromResolvedSchema(
            table._j_table.getResolvedSchema()).build()))

    table = CatalogBaseTable(
        self.t_env._j_tenv.getCatalogManager().getTable(
            table._j_table.getQueryOperation().getTableIdentifier()).get().getTable())
    self.assertEqual("fake", table.get_options().get("connector"))
def test_register_table_source_and_register_table_sink(self):
    self.env.set_parallelism(1)
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    self.prepare_csv_source(source_path, data, field_types, field_names)
    sink_path = os.path.join(self.tempdir, 'streaming2.csv')
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    t_env = self.t_env

    # register_table_source
    t_env.connect(FileSystem().path(source_path))\
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .register_table_source("source")

    # register_table_sink
    t_env.connect(FileSystem().path(sink_path))\
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .register_table_sink("sink")

    t_env.scan("source") \
        .select("a + 1, b, c") \
        .insert_into("sink")
    self.t_env.execute("test")

    with open(sink_path, 'r') as f:
        lines = f.read()
        assert lines == '2,Hi,Hello\n' + '3,Hello,Hello\n'
def process(self, execution_context: flink.ExecutionContext,
            input_list: List[Table] = None) -> List[Table]:
    input_file = os.path.join(os.getcwd(), 'resources', 'word_count.txt')
    t_env = execution_context.table_env
    t_env.connect(FileSystem().path(input_file)) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    return [t_env.from_path('mySource')]
def execute(self, function_context: FlinkFunctionContext) -> Table:
    example_meta: af.ExampleMeta = function_context.get_example_meta()
    t_env = function_context.get_table_env()
    t_env.connect(FileSystem().path(example_meta.batch_uri)) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    return t_env.from_path('mySource')
def group_by_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("group_by_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())) \
        .with_format(
            Json()
            .derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("result")
    orders = st_env.scan("Orders")
    group_by_table = orders.group_by("a").select("a, b.sum as d")
    # Because the Elasticsearch index maps both fields as text:
    # {"a":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
    #  "b":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}
    # we need to cast the aggregated value to VARCHAR in this demo.
    st_env.register_table("group_table", group_by_table)
    result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table")
    result.insert_into("result")
    st_env.execute("group by agg streaming")
def test_register_temporary_table(self):
    self.t_env.get_config().get_configuration().set_string(
        "parallelism.default", "1")
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    self.prepare_csv_source(source_path, data, field_types, field_names)
    sink_path = os.path.join(self.tempdir, 'streaming2.csv')
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    t_env = self.t_env

    t_env.connect(FileSystem().path(source_path))\
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .create_temporary_table("source")

    t_env.connect(FileSystem().path(sink_path))\
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .create_temporary_table("sink")

    t_env.from_path("source").select("a + 1, b, c").execute_insert("sink").wait()

    with open(sink_path, 'r') as f:
        lines = f.read()
        assert lines == '2,Hi,Hello\n' + '3,Hello,Hello\n'
def test_field_in_string(self):
    schema = Schema()\
        .field("int_field", 'INT')\
        .field("long_field", 'BIGINT')\
        .field("string_field", 'VARCHAR')\
        .field("timestamp_field", 'SQL_TIMESTAMP')\
        .field("time_field", 'SQL_TIME')\
        .field("date_field", 'SQL_DATE')\
        .field("double_field", 'DOUBLE')\
        .field("float_field", 'FLOAT')\
        .field("byte_field", 'TINYINT')\
        .field("short_field", 'SMALLINT')\
        .field("boolean_field", 'BOOLEAN')

    properties = schema.to_properties()
    expected = {
        'schema.0.name': 'int_field',
        'schema.0.data-type': 'INT',
        'schema.1.name': 'long_field',
        'schema.1.data-type': 'BIGINT',
        'schema.2.name': 'string_field',
        'schema.2.data-type': 'VARCHAR',
        'schema.3.name': 'timestamp_field',
        'schema.3.data-type': 'TIMESTAMP(3)',
        'schema.4.name': 'time_field',
        'schema.4.data-type': 'TIME(0)',
        'schema.5.name': 'date_field',
        'schema.5.data-type': 'DATE',
        'schema.6.name': 'double_field',
        'schema.6.data-type': 'DOUBLE',
        'schema.7.name': 'float_field',
        'schema.7.data-type': 'FLOAT',
        'schema.8.name': 'byte_field',
        'schema.8.data-type': 'TINYINT',
        'schema.9.name': 'short_field',
        'schema.9.data-type': 'SMALLINT',
        'schema.10.name': 'boolean_field',
        'schema.10.data-type': 'BOOLEAN'
    }
    self.assertEqual(expected, properties)
def test_field(self):
    schema = Schema()\
        .field("int_field", DataTypes.INT())\
        .field("long_field", DataTypes.BIGINT())\
        .field("string_field", DataTypes.STRING())\
        .field("timestamp_field", DataTypes.TIMESTAMP(3))\
        .field("time_field", DataTypes.TIME())\
        .field("date_field", DataTypes.DATE())\
        .field("double_field", DataTypes.DOUBLE())\
        .field("float_field", DataTypes.FLOAT())\
        .field("byte_field", DataTypes.TINYINT())\
        .field("short_field", DataTypes.SMALLINT())\
        .field("boolean_field", DataTypes.BOOLEAN())

    properties = schema.to_properties()
    expected = {
        'schema.0.name': 'int_field',
        'schema.0.data-type': 'INT',
        'schema.1.name': 'long_field',
        'schema.1.data-type': 'BIGINT',
        'schema.2.name': 'string_field',
        'schema.2.data-type': 'VARCHAR(2147483647)',
        'schema.3.name': 'timestamp_field',
        'schema.3.data-type': 'TIMESTAMP(3)',
        'schema.4.name': 'time_field',
        'schema.4.data-type': 'TIME(0)',
        'schema.5.name': 'date_field',
        'schema.5.data-type': 'DATE',
        'schema.6.name': 'double_field',
        'schema.6.data-type': 'DOUBLE',
        'schema.7.name': 'float_field',
        'schema.7.data-type': 'FLOAT',
        'schema.8.name': 'byte_field',
        'schema.8.data-type': 'TINYINT',
        'schema.9.name': 'short_field',
        'schema.9.data-type': 'SMALLINT',
        'schema.10.name': 'boolean_field',
        'schema.10.data-type': 'BOOLEAN'
    }
    self.assertEqual(expected, properties)
def word_count():
    result = wikipedia.page("New York City")
    content = result.summary
    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)
    print(add.add(10, 5))
    print("Word Count")

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    # sink_ddl = """
    #     create table Results(
    #         word VARCHAR,
    #         `count` BIGINT
    #     ) with (
    #         'connector.type' = 'filesystem',
    #         'format.type' = 'csv',
    #         'connector.path' = '{}'
    #     )
    #     """.format(result_path)
    # The sink path must match the result_path that is cleaned up and logged above.
    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('Results')
    # t_env.sql_update(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")
    t_env.execute("word_count")
def test_with_schema(self):
    descriptor = self.t_env.connect(FileSystem())

    descriptor = descriptor.with_format(OldCsv()).with_schema(Schema().field("a", "INT"))

    properties = descriptor.to_properties()
    expected = {'schema.0.name': 'a',
                'schema.0.data-type': 'INT',
                'format.type': 'csv',
                'format.property-version': '1',
                'connector.type': 'filesystem',
                'connector.property-version': '1'}
    assert properties == expected
def register_transactions_es_sink(st_env):
    st_env.connect(Elasticsearch()
                   .version("7")
                   .host("localhost", 9200, "http")
                   .index("account-activity")) \
        .with_schema(Schema()
                     .field("event_id", DataTypes.STRING())
                     .field("account_id", DataTypes.DOUBLE())
                     .field("event_type", DataTypes.STRING())
                     .field("location_country", DataTypes.STRING())
                     .field("event_timestamp", DataTypes.TIMESTAMP(precision=3))) \
        .with_format(Json().derive_schema()) \
        .in_upsert_mode() \
        .create_temporary_table("sink_elasticsearch")
def run(self):
    # A StreamTableEnvironment must be created from a StreamExecutionEnvironment
    # (the original used ExecutionEnvironment, which does not match).
    exec_env = StreamExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = StreamTableEnvironment.create(exec_env, t_config)

    t_env.connect(FileSystem().path('/tmp/input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    model = Model.fromFile('./../batch_ml/model.pmml')
    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')
    t_env.execute("tutorial_job")
    self.read_data()
    result = model.predict({
        "Sepal_Length": 5.1,
        "Sepal_Width": 3.5,
        "Petal_Length": 1.4,
        "Petal_Width": 0.2
    })
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    t_env.register_java_function("len", "org.apache.flink.udf.UDFLength")
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, len(word), count(1) as count") \
        .insert_into("Results")
    t_env.execute("word_count")
def register_transactions_es_sink(st_env):
    st_env.connect(Elasticsearch()
                   .version("6")
                   .host("0.0.0.0", 9200, "http")
                   .index("transactions-supermarket-case")
                   .document_type("usage")) \
        .with_schema(Schema()
                     .field("customer", DataTypes.STRING())
                     .field("count_transactions", DataTypes.STRING())
                     .field("total_online_payment_amount", DataTypes.DOUBLE())
                     .field("total_in_store_payment_amount", DataTypes.DOUBLE())
                     .field("lon", DataTypes.FLOAT())
                     .field("lat", DataTypes.FLOAT())
                     .field("last_transaction_time", DataTypes.STRING())) \
        .with_format(Json().derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("sink_elasticsearch")
def register_cnt_sink(st_env):
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("elasticsearch", 9200, "http")
        .index("area-cnts")
        .document_type('areacnt')
        .key_delimiter("$")) \
        .with_schema(
            Schema()
            .field("???", ???)
            .field("???", DataTypes.BIGINT())) \
        .with_format(
            Json()
            .derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("sink")
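# The ??? placeholders above are left for the exercise to fill in. A plausible
# completion, as a sketch only (the field names "area" and "cnt" are hypothetical,
# inferred from the "area-cnts" index name and the BIGINT count field), could be:
#     .with_schema(
#         Schema()
#         .field("area", DataTypes.STRING())
#         .field("cnt", DataTypes.BIGINT()))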