Example #1
    def test_with_schema(self):
        descriptor = self.t_env.connect(FileSystem())

        descriptor = descriptor.with_format(OldCsv()).with_schema(
            Schema().field("a", "INT"))

        properties = descriptor.to_properties()
        expected = {
            'schema.0.name': 'a',
            'schema.0.data-type': 'INT',
            'format.type': 'csv',
            'format.property-version': '1',
            'connector.type': 'filesystem',
            'connector.property-version': '1'
        }
        assert properties == expected
Example #2
    def test_in_append_mode(self):
        descriptor = self.t_env.connect(FileSystem())

        descriptor = descriptor\
            .with_format(OldCsv())\
            .in_append_mode()

        properties = descriptor.to_properties()
        expected = {
            'update-mode': 'append',
            'format.type': 'csv',
            'format.property-version': '1',
            'connector.property-version': '1',
            'connector.type': 'filesystem'
        }
        assert properties == expected
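The two tests above exercise with_schema and in_append_mode separately; below is a minimal sketch of how they might combine on one descriptor. The merged expected dict is an assumption based on the two property maps above, not output copied from the real test suite.

    def test_with_schema_in_append_mode(self):
        # sketch only: assumes the property maps from the two tests above simply merge
        descriptor = self.t_env.connect(FileSystem()) \
            .with_format(OldCsv()) \
            .with_schema(Schema().field("a", "INT")) \
            .in_append_mode()

        properties = descriptor.to_properties()
        expected = {
            'schema.0.name': 'a',
            'schema.0.data-type': 'INT',
            'update-mode': 'append',
            'format.type': 'csv',
            'format.property-version': '1',
            'connector.type': 'filesystem',
            'connector.property-version': '1'
        }
        assert properties == expected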
Example #3
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]

    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Example #4
    def test_field(self):
        csv = OldCsv()

        csv.field("a", DataTypes.BIGINT())
        csv.field("b", DataTypes.STRING())
        csv.field("c", "SQL_TIMESTAMP")

        properties = csv.to_properties()
        expected = {'format.fields.0.name': 'a',
                    'format.fields.0.data-type': 'BIGINT',
                    'format.fields.1.name': 'b',
                    'format.fields.1.data-type': 'VARCHAR(2147483647)',
                    'format.fields.2.name': 'c',
                    'format.fields.2.data-type': 'TIMESTAMP(3)',
                    'format.type': 'csv',
                    'format.property-version': '1'}
        self.assertEqual(expected, properties)
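The delimiter setters used in other examples also surface in this flat property map; here is a sketch of such a check, where the 'format.field-delimiter' and 'format.line-delimiter' keys are assumed legacy CSV descriptor keys rather than values taken from the real test suite.

    def test_delimiters(self):
        # sketch only: the exact property keys asserted below are assumptions
        csv = OldCsv().field_delimiter('|').line_delimiter('\n')

        properties = csv.to_properties()
        self.assertEqual('csv', properties['format.type'])
        self.assertEqual('|', properties['format.field-delimiter'])
        self.assertEqual('\n', properties['format.line-delimiter'])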
Example #5
    def test_schema(self):
        csv = OldCsv()
        schema = TableSchema(["a", "b"], [DataTypes.INT(), DataTypes.STRING()])

        csv = csv.schema(schema)

        properties = csv.to_properties()
        expected = {
            'format.fields.0.name': 'a',
            'format.fields.0.data-type': 'INT',
            'format.fields.1.name': 'b',
            'format.fields.1.data-type': 'VARCHAR(2147483647)',
            'format.type': 'csv',
            'format.property-version': '1'
        }

        self.assertEqual(expected, properties)
Example #6
    def execute(self, function_context: FlinkFunctionContext, input_table: Table) -> None:
        example_meta: af.ExampleMeta = function_context.get_example_meta()
        output_file = example_meta.batch_uri
        if os.path.exists(output_file):
            os.remove(output_file)

        t_env = function_context.get_table_env()
        statement_set = function_context.get_statement_set()
        t_env.connect(FileSystem().path(output_file)) \
            .with_format(OldCsv()
                         .field_delimiter('\t')
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .with_schema(Schema()
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .create_temporary_table('mySink')
        statement_set.add_insert('mySink', input_table)
Example #7
    def process(self, execution_context: flink.ExecutionContext, input_list: List[Table] = None) -> List[Table]:
        output_file = os.path.join(os.getcwd(), 'output')
        if os.path.exists(output_file):
            os.remove(output_file)

        t_env = execution_context.table_env
        statement_set = execution_context.statement_set
        t_env.connect(FileSystem().path(output_file)) \
            .with_format(OldCsv()
                         .field_delimiter('\t')
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .with_schema(Schema()
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .create_temporary_table('mySink')
        statement_set.add_insert('mySink', input_list[0])
        return []
Example #8
def word_count():
    # read and concatenate the five input files, closing each one
    content = ""
    for i in range(1, 6):
        with open("/home/mnm/flink-1.9.1/%d" % i, "r") as f:
            content += f.read()

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("Python batch word count")
Example #9
class main():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # t_env.connect(FileSystem().path('./temp/deviceorientation')) \
    #     .with_format(OldCsv()
    #                 .field('word', DataTypes.STRING())) \
    #     .with_schema(Schema()
    #                 .field('word', DataTypes.STRING())) \
    #     .create_temporary_table('mySource')
    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = './temp/input'
        )
    """
    t_env.sql_update(my_source_ddl)

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                    .field_delimiter('\t')
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')
    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')

    t_env.execute("tutorial_job")
Example #10
 def execute(self, function_context: FlinkFunctionContext,
             input_list: List[Table]) -> List[Table]:
     t_env = function_context.get_table_env()
     statement_set = function_context.get_statement_set()
     table = input_list[0]
     t_env.register_function(
         "build_index",
         udf(BuildIndexUDF(self.path, self.element_type, self.dimension),
             [DataTypes.STRING(), DataTypes.STRING()], DataTypes.STRING()))
     dummy_output_path = '/tmp/indexed_key'
     if os.path.exists(dummy_output_path):
         if os.path.isdir(dummy_output_path):
             shutil.rmtree(dummy_output_path)
         else:
             os.remove(dummy_output_path)
     t_env.connect(FileSystem().path(dummy_output_path)) \
         .with_format(OldCsv()
                      .field('key', DataTypes.STRING())) \
         .with_schema(Schema()
                      .field('key', DataTypes.STRING())) \
         .create_temporary_table('train_sink')
     statement_set.add_insert(
         "train_sink", table.select("build_index(uuid, feature_data)"))
     return []
Example #11
    )
).with_schema(
    Schema()
    .field('datetime', DataTypes.STRING())
    .field('text', DataTypes.STRING())
).in_append_mode().register_table_source(
    'source'
)


result_path = '/notebooks/output-tensorflow.csv'

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv()
    .field_delimiter(',')
    .field('datetime', DataTypes.STRING())
    .field('sentence', DataTypes.STRING())
    .field('label', DataTypes.STRING())
).with_schema(
    Schema()
    .field('datetime', DataTypes.STRING())
    .field('sentence', DataTypes.STRING())
    .field('label', DataTypes.STRING())
).in_append_mode().register_table_sink(
    'sink'
)

st_env.from_path('source').select(
    'datetime, sentence, predict(sentence)'
).insert_into('sink')
Example #12
table_config.set_null_check(False)

# Create table environment where the source and sink tables will be registered
table_env = StreamTableEnvironment.create(env, table_config)

from pyflink.table.descriptors import Kafka, Json, OldCsv, Schema, FileSystem

directories = ['/flink/lib']
for directory in directories:
    for jar in glob.glob(os.path.join(directory, '*.jar')):
        sys.path.append(jar)

# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer11
# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer09

OldCsv()
print("debug 010")

Kafka()
print("debug 020")
Json()
print("debug 030")


sourcetable = table_env \
    .connect(Kafka()
             .properties({'update-mode': 'append', 'connector.topic': 'machine.data',
                          'connector.properties.zookeeper.connect': 'localhost:2181',
                          'connector.properties.bootstrap.servers': 'localhost:9092'})) \
    .with_format(Json().
                 json_schema(
Example #13
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(2)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('input')) \
    .with_format(OldCsv()
                 .line_delimiter(' ')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source("inputSource")

t_env.connect(FileSystem().path('output')) \
    .with_format(OldCsv().field_delimiter(',').field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .with_schema(Schema().field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .register_table_sink('sink')

t_env.scan('inputSource').group_by('word').select('word, count(1)').insert_into('sink')

t_env.execute('my first job')
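The job above reads a file named 'input' from the working directory and, because of line_delimiter(' '), treats every space-separated token as one record. A hypothetical setup sketch (not part of the original snippet) that creates such a file before the job runs:

# hypothetical setup: write a space-delimited 'input' file for the source table to consume
with open('input', 'w') as f:
    f.write('flink pyflink table api flink word count')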
Example #14
def word_count():
    environment_settings = EnvironmentSettings.new_instance() \
        .in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(
        environment_settings=environment_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    # set the Python executable explicitly in case `python` does not point to Python 3
    t_env.get_config().set_python_executable("python3")

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)

    t_env.get_config().get_configuration().set_string("parallelism.default",
                                                      "1")

    data = [
        ("iPhone 11,30,5499,Beijing", ), ("iPhone 11 Pro,20,8699,Guangzhou", ),
        ("MacBook Pro,10,9999,Beijing", ), ("AirPods Pro,50,1999,Beijing", ),
        ("MacBook Pro,10,11499,Shanghai", ), ("iPhone 11,30,5999,Shanghai", ),
        ("iPhone 11 Pro,20,9999,Shenzhen", ),
        ("MacBook Pro,10,13899,Hangzhou", ), ("iPhone 11,10,6799,Beijing", ),
        ("MacBook Pro,10,18999,Beijing", ),
        ("iPhone 11 Pro,10,11799,Shenzhen", ),
        ("MacBook Pro,10,22199,Shanghai", ), ("AirPods Pro,40,1999,Shanghai", )
    ]
    t_env.from_elements(data, ["line"]) \
        .select("split(line) as str_array") \
        .select("get(str_array, 3) as city, "
                "get(str_array, 1).cast(LONG) as count, "
                "get(str_array, 2).cast(LONG) as unit_price") \
        .select("city, count, count * unit_price as total_price") \
        .group_by("city") \
        .select("city, "
                "sum(count) as sales_volume, "
                "sum(total_price) as sales") \
        .insert_into("Results")

    t_env.execute("word_count")
Example #15
st_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka-base_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-jdbc_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-sql-connector-kafka_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/kafka-clients-2.1.0.jar"
)

# read from Kafka
properties = {
    "zookeeper.connect": "nn1.hadoop:2181,nn2.hadoop:2181,s1.hadoop:2181",
    "bootstrap.servers": "nn1.hadoop:9092,nn2.hadoop:9092,s1.hadoop:9092",
    "group.id": "testGroup"
}
st_env.connect(Kafka().properties(properties).version("universal").topic("test").start_from_latest()) \
    .with_format(Json()).with_schema(Schema() \
        .field('throughputReqMax', DataTypes.BIGINT()) \
        .field('throughputReqTotal', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

# write to CSV
st_env.connect(FileSystem().path('/usr/local/flink/test/result3.txt')) \
    .with_format(OldCsv()
                .field('sub', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sub', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

# subtract throughputReqMax from throughputReqTotal in the Kafka data and insert the result into the sink
st_env.from_path('mySource')\
    .select("(throughputReqTotal-throughputReqMax)") \
    .insert_into('mySink')
st_env.execute("job_test")
Example #16
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

t_env.connect(Kafka()
              .version("0.11")
              .topic("test")
              .property("zookeeper.connect", "localhost:2181")
              .property("bootstrap.servers", "localhost:9092")
              ) \
    .in_append_mode() \
    .with_format(Csv()
                 .line_delimiter("\r\n")      \
                 .derive_schema()) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path('../production_data/kafkaoutput')) \
    .with_format(OldCsv()
                 .field('tbd', DataTypes.INT())) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_sink('mySink')

t_env.scan('mySource') \
    .select('tbd') \
    .where("tbd = 1") \
    .insert_into('mySink')

t_env.execute("tutorial_job")
Example #17
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)


@udf(input_types=[DataTypes.DECIMAL(38, 12, nullable=True)],
     result_type=DataTypes.DECIMAL(38, 12, nullable=True))
def myadd(i):
    return i * i * 2


# add = udf(myadd, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())

t_env.register_function("add", myadd)

t_env.connect(FileSystem().path('/tmp/input')) \
    .with_format(OldCsv()
                 .field('b', DataTypes.DECIMAL(38,12,nullable=True))) \
    .with_schema(Schema()
                 .field('b', DataTypes.DECIMAL(38,12,nullable=True))) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/tmp/output')) \
    .with_format(OldCsv()
                 .field('sum', DataTypes.DECIMAL(38,12,nullable=True))) \
    .with_schema(Schema()
                 .field('sum', DataTypes.DECIMAL(38,12,nullable=True))) \
    .create_temporary_table('mySink')

t_env.from_path('mySource')\
    .select("add(b)") \
    .insert_into('mySink')
Example #18
        # the path may be absolute or relative, but note it must not be prefixed with file://
        t_env.set_python_requirements(dir_requirements, dir_cache)
    else:
        # Option 2: specify a requirements.txt describing the dependencies; they are downloaded when the job runs (not recommended).
        t_env.set_python_requirements(dir_requirements)

# ########################### create the source table ###########################

dir_log = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                       'syslog.text')

# create the source table via the Table API
# the source table has a single column named line
t_env.connect(FileSystem().path(dir_log)) \
    .with_format(OldCsv()
                 .line_delimiter('\n')
                 .field('line', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('line', DataTypes.STRING())) \
    .create_temporary_table('source')

# ########################### create the sink table ###########################

dir_result = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                          'result.csv')

# delete the file/directory if it already exists
if os.path.exists(dir_result):
    if os.path.isfile(dir_result):
        os.remove(dir_result)
    else:
Example #19
t_env = StreamTableEnvironment.create(env)

# register function
t_env.register_function("add1", add1)
t_env.register_function("add2", add2)
t_env.register_function("add3", add3)
t_env.register_function("add4", add4)

t = t_env.from_elements([(1, 2, 'Welcome'), (2, 3, 'To'), (3, 4, 'PyFlink')],
                        ['a', 'b', 'c'])

t_env.connect(FileSystem().path(sink_path))\
    .with_format(OldCsv()
        .field_delimiter(',')
        .field("add1", DataTypes.BIGINT())
        .field("add2", DataTypes.BIGINT())
        .field("add3", DataTypes.BIGINT())
        .field("add4", DataTypes.BIGINT())
        .field("b", DataTypes.BIGINT())
        .field("c", DataTypes.STRING()))\
    .with_schema(Schema()
        .field("add1", DataTypes.BIGINT())
        .field("add2", DataTypes.BIGINT())
        .field("add3", DataTypes.BIGINT())
        .field("add4", DataTypes.BIGINT())
        .field("b", DataTypes.BIGINT())
        .field("c", DataTypes.STRING()))\
    .register_table_sink("pyflink_sink")

t.select("add1(a, b), add2(a, b), add3(a, b), add4(a, b), b, c").insert_into(
    "pyflink_sink")
Example #20
import pyflink as fk
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)
# connector-based input
t_env.connect(FileSystem().path("./data/in")).with_format(OldCsv().field("word",DataTypes.STRING()))\
    .with_schema(Schema().field('word',DataTypes.STRING()))\
    .create_temporary_table('mySource')
t_env.connect(FileSystem().path("./data/out"))\
    .with_format(OldCsv().field_delimiter('\t').field('word',DataTypes.STRING()).field('count',DataTypes.BIGINT()))\
    .with_schema(Schema().field('word',DataTypes.STRING()).field('count',DataTypes.BIGINT()))\
    .create_temporary_table("mySink")
my_source_ddl = """
    create table mySource (
        word VARCHAR
    ) with (
        'connector.type' = 'filesystem',
        'format.type' = 'csv',
        'connector.path' = './data/in/in.csv'
    )
"""

my_sink_ddl = """
    create table mySink (
        word VARCHAR,
        `count` BIGINT
Example #21
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.udf import udf

env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)

add = udf(lambda i, j: i + j, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())
t_env.register_function("add", add)

t_env.connect(FileSystem().path('/opt/examples/data/udf_add_input')) \
    .with_format(OldCsv()
                 .field('a', DataTypes.BIGINT())
                 .field('b', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('a', DataTypes.BIGINT())
                 .field('b', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/opt/examples/data/udf_add_output')) \
    .with_format(OldCsv()
                 .field('sum', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sum', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

t_env.from_path('mySource')\
    .select("add(a, b)") \
    .insert_into('mySink')
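insert_into only declares the pipeline; in this older API nothing runs until the table environment's execute() is called, as the other examples do. A minimal trigger (the job name is arbitrary):

# trigger the declared insert; the job name is arbitrary
t_env.execute('udf_add_job')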
Example #22
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig, WriteMode
from pyflink.table.descriptors import FileSystem, OldCsv, Schema
from pyflink.table.types import DataTypes

t_config = TableConfig()
env = ExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = BatchTableEnvironment.create(env, t_config)

source_file = '/notebooks/big-text.txt'
sink_file = '/notebooks/sink.csv'

t_env.connect(FileSystem().path(source_file)) \
    .with_format(OldCsv()
                 .line_delimiter('\n')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path(sink_file)) \
    .with_format(OldCsv()
                 .field_delimiter(',')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .register_table_sink('mySink')

t_env.scan('mySource').group_by('word').select('word, count(1)').insert_into('mySink')

t_env.execute('wordcount')
Example #23
exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

t_env.connect(Kafka()
              .version("0.11")
              .topic("test")
              .property("zookeeper.connect", "localhost:2181")
              .property("bootstrap.servers", "localhost:9092")
              ) \
    .in_append_mode() \
    .with_format(Csv()
                 .line_delimiter("\r\n")      \
                 .derive_schema()) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.STRING())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path('../production_data/kafkaoutput')) \
    .with_format(OldCsv()
                 .field('tbd', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.STRING())) \
    .register_table_sink('mySink')

t_env.scan('mySource') \
    .insert_into('mySink')

t_env.execute("tutorial_job")
Example #24
exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

input_path = './input/data.json'
output_path = './output/output.txt'
if os.path.exists(output_path):
    os.remove(output_path)

wpre.wcpretreatment(input_path)
input_path = './input'

t_env.connect(FileSystem().path(input_path)) \
    .with_format(OldCsv()
                 .field('dateRep', DataTypes.DATE(True))) \
    .with_schema(Schema()
                 .field('dateRep', DataTypes.DATE(True))) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path(output_path)) \
    .with_format(OldCsv()
                 .field_delimiter('\t')
                 .field('dateRep', DataTypes.DATE(True))
                 .field('deaths_weekly', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('dateRep', DataTypes.DATE(True))
                 .field('deaths_weekly', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

t_env.from_path('mySource') \
Example #25
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(
    env,
    environment_settings = EnvironmentSettings.new_instance()
    .use_blink_planner()
    .build(),
)

result_path = '/notebooks/output.csv'

print('Results directory:', result_path)

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv()
    .field_delimiter(',')
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).with_schema(
    Schema()
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).register_table_sink(
    'Results'
)

elements = [(word, 1) for word in content.split(' ')]

t_env.from_elements(elements, ['word', 'count']).group_by('word').select(
    'word, count(1) as count'
).insert_into('Results')
Example #26
    sentences = [string]

    x = np.zeros((len(sentences), maxlen))
    for i, sentence in enumerate(sentences):
        for no, k in enumerate(sentence.split()[:maxlen][::-1]):
            x[i, -1 - no] = dic.get(k, UNK)
    indices = np.argmax(sess.run(Y, feed_dict={X: x}), axis=1)
    return label[indices[0]]


t_env.set_python_requirements('/notebooks/requirements.txt')

t_env.register_function('predict', predict)

result_path = '/notebooks/output-tensorflow.csv'

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv().field_delimiter(',').field('sentence', DataTypes.STRING()).field(
        'label', DataTypes.STRING())).with_schema(Schema().field(
            'sentence', DataTypes.STRING()).field(
                'label', DataTypes.STRING())).create_temporary_table('mySink')

elements = [(sentence, ) for sentence in content.split('\n')]

t_env.from_elements(
    elements,
    ['sentence']).select('sentence, predict(sentence)').insert_into('mySink')

t_env.execute('predict')