Example #1
    def test_with_schema(self):
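        """with_format(OldCsv()) + with_schema(Schema()) should yield the expected descriptor properties."""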
        descriptor = self.t_env.connect(FileSystem())

        descriptor = descriptor.with_format(OldCsv()).with_schema(
            Schema().field("a", "INT"))

        properties = descriptor.to_properties()
        expected = {
            'schema.0.name': 'a',
            'schema.0.data-type': 'INT',
            'format.type': 'csv',
            'format.property-version': '1',
            'connector.type': 'filesystem',
            'connector.property-version': '1'
        }
        assert properties == expected
Example #2
    def test_in_append_mode(self):
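        """in_append_mode() should add 'update-mode': 'append' to the descriptor properties."""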
        descriptor = self.t_env.connect(FileSystem())

        descriptor = descriptor\
            .with_format(OldCsv())\
            .in_append_mode()

        properties = descriptor.to_properties()
        expected = {
            'update-mode': 'append',
            'format.type': 'csv',
            'format.property-version': '1',
            'connector.property-version': '1',
            'connector.type': 'filesystem'
        }
        assert properties == expected
Example #3
def word_count():
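    # Batch word count: split an in-memory string into words, count occurrences per
    # word, and write the results to a CSV sink registered as "Results".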
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]

    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Example #4
    def test_field(self):
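        """field() should accept DataTypes objects as well as legacy type strings such as 'SQL_TIMESTAMP'."""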
        csv = OldCsv()

        csv.field("a", DataTypes.BIGINT())
        csv.field("b", DataTypes.STRING())
        csv.field("c", "SQL_TIMESTAMP")

        properties = csv.to_properties()
        expected = {'format.fields.0.name': 'a',
                    'format.fields.0.data-type': 'BIGINT',
                    'format.fields.1.name': 'b',
                    'format.fields.1.data-type': 'VARCHAR(2147483647)',
                    'format.fields.2.name': 'c',
                    'format.fields.2.data-type': 'TIMESTAMP(3)',
                    'format.type': 'csv',
                    'format.property-version': '1'}
        self.assertEqual(expected, properties)
Example #5
    def test_schema(self):
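        """schema() should derive the format fields from an existing TableSchema."""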
        csv = OldCsv()
        schema = TableSchema(["a", "b"], [DataTypes.INT(), DataTypes.STRING()])

        csv = csv.schema(schema)

        properties = csv.to_properties()
        expected = {
            'format.fields.0.name': 'a',
            'format.fields.0.data-type': 'INT',
            'format.fields.1.name': 'b',
            'format.fields.1.data-type': 'VARCHAR(2147483647)',
            'format.type': 'csv',
            'format.property-version': '1'
        }

        self.assertEqual(expected, properties)
Example #6
    def execute(self, function_context: FlinkFunctionContext, input_table: Table) -> None:
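        # Write the input table to the example's batch_uri as a tab-delimited CSV,
        # via a temporary table named 'mySink'.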
        example_meta: af.ExampleMeta = function_context.get_example_meta()
        output_file = example_meta.batch_uri
        if os.path.exists(output_file):
            os.remove(output_file)

        t_env = function_context.get_table_env()
        statement_set = function_context.get_statement_set()
        t_env.connect(FileSystem().path(output_file)) \
            .with_format(OldCsv()
                         .field_delimiter('\t')
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .with_schema(Schema()
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .create_temporary_table('mySink')
        statement_set.add_insert('mySink', input_table)
Example #7
    def process(self, execution_context: flink.ExecutionContext, input_list: List[Table] = None) -> List[Table]:
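        # Same sink pattern as above, driven by an ExecutionContext: the first input
        # table is written to ./output as a tab-delimited CSV.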
        output_file = os.path.join(os.getcwd(), 'output')
        if os.path.exists(output_file):
            os.remove(output_file)

        t_env = execution_context.table_env
        statement_set = execution_context.statement_set
        t_env.connect(FileSystem().path(output_file)) \
            .with_format(OldCsv()
                         .field_delimiter('\t')
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .with_schema(Schema()
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .create_temporary_table('mySink')
        statement_set.add_insert('mySink', input_list[0])
        return []
Example #8
def word_count():
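    # Variant of the word count above that reads its input from five local files
    # instead of an in-memory string.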
    f1 = open("/home/mnm/flink-1.9.1/1", "r")
    f2 = open("/home/mnm/flink-1.9.1/2", "r")
    f3 = open("/home/mnm/flink-1.9.1/3", "r")
    f4 = open("/home/mnm/flink-1.9.1/4", "r")
    f5 = open("/home/mnm/flink-1.9.1/5", "r")
    content = f1.read() + f2.read() + f3.read() + f4.read() + f5.read()

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("Python batch word count")
Example #9
class main():
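    # Note: because the pipeline sits in the class body, it runs once when the class is defined.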
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # t_env.connect(FileSystem().path('./temp/deviceorientation')) \
    #     .with_format(OldCsv()
    #                 .field('word', DataTypes.STRING())) \
    #     .with_schema(Schema()
    #                 .field('word', DataTypes.STRING())) \
    #     .create_temporary_table('mySource')
    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = './temp/input'
        )
    """
    t_env.sql_update(my_source_ddl)

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                    .field_delimiter('\t')
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')
    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')

    t_env.execute("tutorial_job")
Example #10
    def execute(self, function_context: FlinkFunctionContext,
                input_list: List[Table]) -> List[Table]:
        t_env = function_context.get_table_env()
        statement_set = function_context.get_statement_set()
        table = input_list[0]
        t_env.register_function(
            "build_index",
            udf(BuildIndexUDF(self.path, self.element_type, self.dimension),
                [DataTypes.STRING(), DataTypes.STRING()], DataTypes.STRING()))
        dummy_output_path = '/tmp/indexed_key'
        if os.path.exists(dummy_output_path):
            if os.path.isdir(dummy_output_path):
                shutil.rmtree(dummy_output_path)
            else:
                os.remove(dummy_output_path)
        t_env.connect(FileSystem().path(dummy_output_path)) \
            .with_format(OldCsv()
                         .field('key', DataTypes.STRING())) \
            .with_schema(Schema()
                         .field('key', DataTypes.STRING())) \
            .create_temporary_table('train_sink')
        statement_set.add_insert(
            "train_sink", table.select("build_index(uuid, feature_data)"))
        return []
Example #11
    )
).with_schema(
    Schema()
    .field('datetime', DataTypes.STRING())
    .field('text', DataTypes.STRING())
).in_append_mode().register_table_source(
    'source'
)


result_path = '/notebooks/output-tensorflow.csv'

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv()
    .field_delimiter(',')
    .field('datetime', DataTypes.STRING())
    .field('sentence', DataTypes.STRING())
    .field('label', DataTypes.STRING())
).with_schema(
    Schema()
    .field('datetime', DataTypes.STRING())
    .field('sentence', DataTypes.STRING())
    .field('label', DataTypes.STRING())
).in_append_mode().register_table_sink(
    'sink'
)

st_env.from_path('source').select(
    'datetime, sentence, predict(sentence)'
).insert_into('sink')
Example #12
table_config.set_null_check(False)

# Create table environment where the source and sink tables will be registered
table_env = StreamTableEnvironment.create(env, table_config)

from pyflink.table.descriptors import Kafka, Json, OldCsv, Schema, FileSystem

directories = ['/flink/lib']
for directory in directories:
    for jar in glob.glob(os.path.join(directory, '*.jar')):
        sys.path.append(jar)

# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer11
# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer09
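
# Instantiating the descriptors and printing markers below looks like a smoke test
# that the descriptor classes (and their connector JARs) can be loaded.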

OldCsv()
print("debug 010")

Kafka()
print("debug 020")
Json()
print("debug 030")


sourcetable = table_env \
    .connect(Kafka()
             .properties({'update-mode': 'append', 'connector.topic': 'machine.data',
                          'connector.properties.zookeeper.connect': 'localhost:2181',
                          'connector.properties.bootstrap.servers': 'localhost:9092'})) \
    .with_format(Json().
                 json_schema(
Example #13
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(2)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)
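
# Word count over a plain-text file: line_delimiter(' ') makes every space-separated
# token of 'input' a row in the source table.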

t_env.connect(FileSystem().path('input')) \
    .with_format(OldCsv()
                 .line_delimiter(' ')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source("inputSource")

t_env.connect(FileSystem().path('output')) \
    .with_format(OldCsv().field_delimiter(',').field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .with_schema(Schema().field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .register_table_sink('sink')

t_env.scan('inputSource').group_by('word').select('word, count(1)').insert_into('sink')

t_env.execute('my first job')
Example #14
def word_count():
    environment_settings = EnvironmentSettings.new_instance().in_batch_mode(
    ).use_blink_planner().build()
    t_env = BatchTableEnvironment.create(
        environment_settings=environment_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    # Set the Python executable explicitly here in case `python` does not point to Python 3.
    t_env.get_config().set_python_executable("python3")

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)

    t_env.get_config().get_configuration().set_string("parallelism.default",
                                                      "1")

    data = [
        ("iPhone 11,30,5499,Beijing", ), ("iPhone 11 Pro,20,8699,Guangzhou", ),
        ("MacBook Pro,10,9999,Beijing", ), ("AirPods Pro,50,1999,Beijing", ),
        ("MacBook Pro,10,11499,Shanghai", ), ("iPhone 11,30,5999,Shanghai", ),
        ("iPhone 11 Pro,20,9999,Shenzhen", ),
        ("MacBook Pro,10,13899,Hangzhou", ), ("iPhone 11,10,6799,Beijing", ),
        ("MacBook Pro,10,18999,Beijing", ),
        ("iPhone 11 Pro,10,11799,Shenzhen", ),
        ("MacBook Pro,10,22199,Shanghai", ), ("AirPods Pro,40,1999,Shanghai", )
    ]
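    # Each element is a single CSV-formatted line: "product,count,unit_price,city".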
    t_env.from_elements(data, ["line"]) \
        .select("split(line) as str_array") \
        .select("get(str_array, 3) as city, "
                "get(str_array, 1).cast(LONG) as count, "
                "get(str_array, 2).cast(LONG) as unit_price") \
        .select("city, count, count * unit_price as total_price") \
        .group_by("city") \
        .select("city, "
                "sum(count) as sales_volume, "
                "sum(total_price) as sales") \
        .insert_into("Results")

    t_env.execute("word_count")
Example #15
st_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka-base_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-jdbc_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-sql-connector-kafka_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/kafka-clients-2.1.0.jar"
)

# read from Kafka
properties = {
    "zookeeper.connect": "nn1.hadoop:2181,nn2.hadoop:2181,s1.hadoop:2181",
    "bootstrap.servers": "nn1.hadoop:9092,nn2.hadoop:9092,s1.hadoop:9092",
    "group.id": "testGroup"
}
st_env.connect(Kafka().properties(properties).version("universal").topic("test").start_from_latest()) \
    .with_format(Json()).with_schema(Schema() \
        .field('throughputReqMax', DataTypes.BIGINT()) \
        .field('throughputReqTotal', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

# write to CSV
st_env.connect(FileSystem().path('/usr/local/flink/test/result3.txt')) \
    .with_format(OldCsv()
                .field('sub', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sub', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

# read the two fields from Kafka, compute throughputReqTotal - throughputReqMax, and insert the result into the sink
st_env.from_path('mySource')\
    .select("(throughputReqTotal-throughputReqMax)") \
    .insert_into('mySink')
st_env.execute("job_test")
Example #16
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

t_env.connect(Kafka()
              .version("0.11")
              .topic("test")
              .property("zookeeper.connect", "localhost:2181")
              .property("bootstrap.servers", "localhost:9092")
              ) \
    .in_append_mode() \
    .with_format(Csv()
                 .line_delimiter("\r\n")      \
                 .derive_schema()) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path('../production_data/kafkaoutput')) \
    .with_format(OldCsv()
                 .field('tbd', DataTypes.INT())) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_sink('mySink')

t_env.scan('mySource') \
    .select('tbd') \
    .where("tbd = 1") \
    .insert_into('mySink')

t_env.execute("tutorial_job")
Example #17
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)


@udf(input_types=[DataTypes.DECIMAL(38, 12, nullable=True)],
     result_type=DataTypes.DECIMAL(38, 12, nullable=True))
def myadd(i):
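    # Note: despite being registered as "add", this UDF returns i * i * 2.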
    return i * i * 2


# add = udf(myadd, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())

t_env.register_function("add", myadd)

t_env.connect(FileSystem().path('/tmp/input')) \
    .with_format(OldCsv()
                 .field('b', DataTypes.DECIMAL(38,12,nullable=True))) \
    .with_schema(Schema()
                 .field('b', DataTypes.DECIMAL(38,12,nullable=True))) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/tmp/output')) \
    .with_format(OldCsv()
                 .field('sum', DataTypes.DECIMAL(38,12,nullable=True))) \
    .with_schema(Schema()
                 .field('sum', DataTypes.DECIMAL(38,12,nullable=True))) \
    .create_temporary_table('mySink')

t_env.from_path('mySource')\
    .select("add(b)") \
    .insert_into('mySink')
Example #18
        # The path may be absolute or relative; note that it must NOT be prefixed with file://
        t_env.set_python_requirements(dir_requirements, dir_cache)
    else:
        # Option 2: only pass the requirements.txt describing the dependencies; they are downloaded when the job runs (not recommended).
        t_env.set_python_requirements(dir_requirements)

# ########################### Create the source table ###########################

dir_log = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                       'syslog.text')

# Create the source table using the Table API
# The source table has a single column, named line
t_env.connect(FileSystem().path(dir_log)) \
    .with_format(OldCsv()
                 .line_delimiter('\n')
                 .field('line', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('line', DataTypes.STRING())) \
    .create_temporary_table('source')

# ########################### Create the result table (sink) ###########################

dir_result = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                          'result.csv')

# Delete the file/directory if it already exists
if os.path.exists(dir_result):
    if os.path.isfile(dir_result):
        os.remove(dir_result)
    else:
Example #19
t_env = StreamTableEnvironment.create(env)

# register function
t_env.register_function("add1", add1)
t_env.register_function("add2", add2)
t_env.register_function("add3", add3)
t_env.register_function("add4", add4)

t = t_env.from_elements([(1, 2, 'Welcome'), (2, 3, 'To'), (3, 4, 'PyFlink')],
                        ['a', 'b', 'c'])

t_env.connect(FileSystem().path(sink_path))\
    .with_format(OldCsv()
        .field_delimiter(',')
        .field("add1", DataTypes.BIGINT())
        .field("add2", DataTypes.BIGINT())
        .field("add3", DataTypes.BIGINT())
        .field("add4", DataTypes.BIGINT())
        .field("b", DataTypes.BIGINT())
        .field("c", DataTypes.STRING()))\
    .with_schema(Schema()
        .field("add1", DataTypes.BIGINT())
        .field("add2", DataTypes.BIGINT())
        .field("add3", DataTypes.BIGINT())
        .field("add4", DataTypes.BIGINT())
        .field("b", DataTypes.BIGINT())
        .field("c", DataTypes.STRING()))\
    .register_table_sink("pyflink_sink")

t.select("add1(a, b), add2(a, b), add3(a, b), add4(a, b), b, c").insert_into(
    "pyflink_sink")
Example #20
import pyflink as fk
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)
# connector input (source)
t_env.connect(FileSystem().path("./data/in")).with_format(OldCsv().field("word",DataTypes.STRING()))\
    .with_schema(Schema().field('word',DataTypes.STRING()))\
    .create_temporary_table('mySource')
t_env.connect(FileSystem().path("./data/out"))\
    .with_format(OldCsv().field_delimiter('\t').field('word',DataTypes.STRING()).field('count',DataTypes.BIGINT()))\
    .with_schema(Schema().field('word',DataTypes.STRING()).field('count',DataTypes.BIGINT()))\
    .create_temporary_table("mySink")
my_source_ddl = """
    create table mySource (
        word VARCHAR
    ) with (
        'connector.type' = 'filesystem',
        'format.type' = 'csv',
        'connector.path' = './data/in/in.csv'
    )
"""

my_sink_ddl = """
    create table mySink (
        word VARCHAR,
        `count` BIGINT
Example #21
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.udf import udf

env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)

add = udf(lambda i, j: i + j, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())
t_env.register_function("add", add)

t_env.connect(FileSystem().path('/opt/examples/data/udf_add_input')) \
    .with_format(OldCsv()
                 .field('a', DataTypes.BIGINT())
                 .field('b', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('a', DataTypes.BIGINT())
                 .field('b', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/opt/examples/data/udf_add_output')) \
    .with_format(OldCsv()
                 .field('sum', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sum', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

t_env.from_path('mySource')\
    .select("add(a, b)") \
    .insert_into('mySink')
Example #22
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig, WriteMode
from pyflink.table.descriptors import FileSystem, OldCsv, Schema
from pyflink.table.types import DataTypes
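
# Batch word count: read words from big-text.txt, count them, and write word/count
# pairs to sink.csv.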

t_config = TableConfig()
env = ExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = BatchTableEnvironment.create(env, t_config)

source_file = '/notebooks/big-text.txt'
sink_file = '/notebooks/sink.csv'

t_env.connect(FileSystem().path(source_file)).with_format(
    OldCsv().line_delimiter('\n').field(
        'word', DataTypes.STRING())).with_schema(Schema().field(
            'word', DataTypes.STRING())).register_table_source('mySource')

t_env.connect(FileSystem().path(sink_file)).with_format(
    OldCsv().field_delimiter(',').field('word', DataTypes.STRING()).field(
        'count', DataTypes.BIGINT())).with_schema(Schema().field(
            'word', DataTypes.STRING()).field(
                'count', DataTypes.BIGINT())).register_table_sink('mySink')

t_env.scan('mySource').group_by('word').select('word, count(1)').insert_into(
    'mySink')

t_env.execute('wordcount')
Example #23
exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

t_env.connect(Kafka()
              .version("0.11")
              .topic("test")
              .property("zookeeper.connect", "localhost:2181")
              .property("bootstrap.servers", "localhost:9092")
              ) \
    .in_append_mode() \
    .with_format(Csv()
                 .line_delimiter("\r\n")      \
                 .derive_schema()) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.STRING())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path('../production_data/kafkaoutput')) \
    .with_format(OldCsv()
                 .field('tbd', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.STRING())) \
    .register_table_sink('mySink')

t_env.scan('mySource') \
    .insert_into('mySink')

t_env.execute("tutorial_job")
Example #24
exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

input_path = './input/data.json'
output_path = './output/output.txt'
if os.path.exists(output_path):
    os.remove(output_path)

wpre.wcpretreatment(input_path)
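# wpre.wcpretreatment is a project-specific helper; presumably it preprocesses the
# JSON input into CSV files under ./input, which becomes the source path below.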
input_path = './input'

t_env.connect(FileSystem().path(input_path)) \
    .with_format(OldCsv()
                 .field('dateRep', DataTypes.DATE(True))) \
    .with_schema(Schema()
                 .field('dateRep', DataTypes.DATE(True))) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path(output_path)) \
    .with_format(OldCsv()
                 .field_delimiter('\t')
                 .field('dateRep', DataTypes.DATE(True))
                 .field('deaths_weekly', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('dateRep', DataTypes.DATE(True))
                 .field('deaths_weekly', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

t_env.from_path('mySource') \
Example #25
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(
    env,
    environment_settings = EnvironmentSettings.new_instance()
    .use_blink_planner()
    .build(),
)

result_path = '/notebooks/output.csv'

print('Results directory:', result_path)

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv()
    .field_delimiter(',')
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).with_schema(
    Schema()
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).register_table_sink(
    'Results'
)

elements = [(word, 1) for word in content.split(' ')]

t_env.from_elements(elements, ['word', 'count']).group_by('word').select(
    'word, count(1) as count'
).insert_into('Results')
Example #26
    sentences = [string]

    x = np.zeros((len(sentences), maxlen))
    for i, sentence in enumerate(sentences):
        for no, k in enumerate(sentence.split()[:maxlen][::-1]):
            x[i, -1 - no] = dic.get(k, UNK)
    indices = np.argmax(sess.run(Y, feed_dict={X: x}), axis=1)
    return label[indices[0]]


t_env.set_python_requirements('/notebooks/requirements.txt')

t_env.register_function('predict', predict)

result_path = '/notebooks/output-tensorflow.csv'

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv().field_delimiter(',').field('sentence', DataTypes.STRING()).field(
        'label', DataTypes.STRING())).with_schema(Schema().field(
            'sentence', DataTypes.STRING()).field(
                'label', DataTypes.STRING())).create_temporary_table('mySink')

elements = [(sentence, ) for sentence in content.split('\n')]

t_env.from_elements(
    elements,
    ['sentence']).select('sentence, predict(sentence)').insert_into('mySink')

t_env.execute('predict')