Example #1
    def test_with_schema(self):
        descriptor = self.t_env.connect(FileSystem())

        descriptor = descriptor.with_format(OldCsv()).with_schema(
            Schema().field("a", "INT"))

        properties = descriptor.to_properties()
        expected = {
            'schema.0.name': 'a',
            'schema.0.data-type': 'INT',
            'format.type': 'csv',
            'format.property-version': '1',
            'connector.type': 'filesystem',
            'connector.property-version': '1'
        }
        assert properties == expected
Example #2
    def test_in_append_mode(self):
        descriptor = self.t_env.connect(FileSystem())

        descriptor = descriptor\
            .with_format(OldCsv())\
            .in_append_mode()

        properties = descriptor.to_properties()
        expected = {
            'update-mode': 'append',
            'format.type': 'csv',
            'format.property-version': '1',
            'connector.property-version': '1',
            'connector.type': 'filesystem'
        }
        assert properties == expected
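The two tests above exercise with_schema and in_append_mode separately; below is a minimal sketch of how they might combine on one descriptor. The merged expected dict is an assumption based on the two property maps above, not output copied from the real test suite.

    def test_with_schema_in_append_mode(self):
        # sketch only: assumes the property maps from the two tests above simply merge
        descriptor = self.t_env.connect(FileSystem()) \
            .with_format(OldCsv()) \
            .with_schema(Schema().field("a", "INT")) \
            .in_append_mode()

        properties = descriptor.to_properties()
        expected = {
            'schema.0.name': 'a',
            'schema.0.data-type': 'INT',
            'update-mode': 'append',
            'format.type': 'csv',
            'format.property-version': '1',
            'connector.type': 'filesystem',
            'connector.property-version': '1'
        }
        assert properties == expected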
Example #3
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]

    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Example #4
    def test_field(self):
        csv = OldCsv()

        csv.field("a", DataTypes.BIGINT())
        csv.field("b", DataTypes.STRING())
        csv.field("c", "SQL_TIMESTAMP")

        properties = csv.to_properties()
        expected = {'format.fields.0.name': 'a',
                    'format.fields.0.data-type': 'BIGINT',
                    'format.fields.1.name': 'b',
                    'format.fields.1.data-type': 'VARCHAR(2147483647)',
                    'format.fields.2.name': 'c',
                    'format.fields.2.data-type': 'TIMESTAMP(3)',
                    'format.type': 'csv',
                    'format.property-version': '1'}
        self.assertEqual(expected, properties)
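The delimiter setters used in other examples also surface in this flat property map; here is a sketch of such a check, where the 'format.field-delimiter' and 'format.line-delimiter' keys are assumed legacy CSV descriptor keys rather than values taken from the real test suite.

    def test_delimiters(self):
        # sketch only: the exact property keys asserted below are assumptions
        csv = OldCsv().field_delimiter('|').line_delimiter('\n')

        properties = csv.to_properties()
        self.assertEqual('csv', properties['format.type'])
        self.assertEqual('|', properties['format.field-delimiter'])
        self.assertEqual('\n', properties['format.line-delimiter'])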
Example #5
    def test_schema(self):
        csv = OldCsv()
        schema = TableSchema(["a", "b"], [DataTypes.INT(), DataTypes.STRING()])

        csv = csv.schema(schema)

        properties = csv.to_properties()
        expected = {
            'format.fields.0.name': 'a',
            'format.fields.0.data-type': 'INT',
            'format.fields.1.name': 'b',
            'format.fields.1.data-type': 'VARCHAR(2147483647)',
            'format.type': 'csv',
            'format.property-version': '1'
        }

        self.assertEqual(expected, properties)
Example #6
    def execute(self, function_context: FlinkFunctionContext, input_table: Table) -> None:
        example_meta: af.ExampleMeta = function_context.get_example_meta()
        output_file = example_meta.batch_uri
        if os.path.exists(output_file):
            os.remove(output_file)

        t_env = function_context.get_table_env()
        statement_set = function_context.get_statement_set()
        t_env.connect(FileSystem().path(output_file)) \
            .with_format(OldCsv()
                         .field_delimiter('\t')
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .with_schema(Schema()
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .create_temporary_table('mySink')
        statement_set.add_insert('mySink', input_table)
Example #7
    def process(self, execution_context: flink.ExecutionContext, input_list: List[Table] = None) -> List[Table]:
        output_file = os.path.join(os.getcwd(), 'output')
        if os.path.exists(output_file):
            os.remove(output_file)

        t_env = execution_context.table_env
        statement_set = execution_context.statement_set
        t_env.connect(FileSystem().path(output_file)) \
            .with_format(OldCsv()
                         .field_delimiter('\t')
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .with_schema(Schema()
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .create_temporary_table('mySink')
        statement_set.add_insert('mySink', input_list[0])
        return []
Example #8
def word_count():
    # read and concatenate the five input files, closing each one
    content = ""
    for i in range(1, 6):
        with open("/home/mnm/flink-1.9.1/%d" % i, "r") as f:
            content += f.read()

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("Python batch word count")
Example #9
class main():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # t_env.connect(FileSystem().path('./temp/deviceorientation')) \
    #     .with_format(OldCsv()
    #                 .field('word', DataTypes.STRING())) \
    #     .with_schema(Schema()
    #                 .field('word', DataTypes.STRING())) \
    #     .create_temporary_table('mySource')
    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = './temp/input'
        )
    """
    t_env.sql_update(my_source_ddl)

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                    .field_delimiter('\t')
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')
    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')

    t_env.execute("tutorial_job")
Example #10
 def execute(self, function_context: FlinkFunctionContext,
             input_list: List[Table]) -> List[Table]:
     t_env = function_context.get_table_env()
     statement_set = function_context.get_statement_set()
     table = input_list[0]
     t_env.register_function(
         "build_index",
         udf(BuildIndexUDF(self.path, self.element_type, self.dimension),
             [DataTypes.STRING(), DataTypes.STRING()], DataTypes.STRING()))
     dummy_output_path = '/tmp/indexed_key'
     if os.path.exists(dummy_output_path):
         if os.path.isdir(dummy_output_path):
             shutil.rmtree(dummy_output_path)
         else:
             os.remove(dummy_output_path)
     t_env.connect(FileSystem().path(dummy_output_path)) \
         .with_format(OldCsv()
                      .field('key', DataTypes.STRING())) \
         .with_schema(Schema()
                      .field('key', DataTypes.STRING())) \
         .create_temporary_table('train_sink')
     statement_set.add_insert(
         "train_sink", table.select("build_index(uuid, feature_data)"))
     return []
Example #11
    )
).with_schema(
    Schema()
    .field('datetime', DataTypes.STRING())
    .field('text', DataTypes.STRING())
).in_append_mode().register_table_source(
    'source'
)


result_path = '/notebooks/output-tensorflow.csv'

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv()
    .field_delimiter(',')
    .field('datetime', DataTypes.STRING())
    .field('sentence', DataTypes.STRING())
    .field('label', DataTypes.STRING())
).with_schema(
    Schema()
    .field('datetime', DataTypes.STRING())
    .field('sentence', DataTypes.STRING())
    .field('label', DataTypes.STRING())
).in_append_mode().register_table_sink(
    'sink'
)

st_env.from_path('source').select(
    'datetime, sentence, predict(sentence)'
).insert_into('sink')
Example #12
table_config.set_null_check(False)

# Create table environment where the source and sink tables will be registered
table_env = StreamTableEnvironment.create(env, table_config)

from pyflink.table.descriptors import Kafka, Json, OldCsv, Schema, FileSystem

directories = ['/flink/lib']
for directory in directories:
    for jar in glob.glob(os.path.join(directory, '*.jar')):
        sys.path.append(jar)

# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer11
# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer09

OldCsv()
print("debug 010")

Kafka()
print("debug 020")
Json()
print("debug 030")


sourcetable = table_env \
    .connect(Kafka()
             .properties({'update-mode': 'append', 'connector.topic': 'machine.data',
                          'connector.properties.zookeeper.connect': 'localhost:2181',
                          'connector.properties.bootstrap.servers': 'localhost:9092'})) \
    .with_format(Json().
                 json_schema(
Example #13
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(2)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('input')) \
    .with_format(OldCsv()
                 .line_delimiter(' ')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source("inputSource")

t_env.connect(FileSystem().path('output')) \
    .with_format(OldCsv().field_delimiter(',').field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .with_schema(Schema().field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .register_table_sink('sink')

t_env.scan('inputSource').group_by('word').select('word, count(1)').insert_into('sink')

t_env.execute('my first job')
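The job above reads a file named 'input' from the working directory and, because of line_delimiter(' '), treats every space-separated token as one record. A hypothetical setup sketch (not part of the original snippet) that creates such a file before the job runs:

# hypothetical setup: write a space-delimited 'input' file for the source table to consume
with open('input', 'w') as f:
    f.write('flink pyflink table api flink word count')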
Example #14
def word_count():
    environment_settings = EnvironmentSettings.new_instance() \
        .in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(
        environment_settings=environment_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    # set the Python executable explicitly in case `python` does not point to Python 3
    t_env.get_config().set_python_executable("python3")

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)

    t_env.get_config().get_configuration().set_string("parallelism.default",
                                                      "1")

    data = [
        ("iPhone 11,30,5499,Beijing", ), ("iPhone 11 Pro,20,8699,Guangzhou", ),
        ("MacBook Pro,10,9999,Beijing", ), ("AirPods Pro,50,1999,Beijing", ),
        ("MacBook Pro,10,11499,Shanghai", ), ("iPhone 11,30,5999,Shanghai", ),
        ("iPhone 11 Pro,20,9999,Shenzhen", ),
        ("MacBook Pro,10,13899,Hangzhou", ), ("iPhone 11,10,6799,Beijing", ),
        ("MacBook Pro,10,18999,Beijing", ),
        ("iPhone 11 Pro,10,11799,Shenzhen", ),
        ("MacBook Pro,10,22199,Shanghai", ), ("AirPods Pro,40,1999,Shanghai", )
    ]
    t_env.from_elements(data, ["line"]) \
        .select("split(line) as str_array") \
        .select("get(str_array, 3) as city, "
                "get(str_array, 1).cast(LONG) as count, "
                "get(str_array, 2).cast(LONG) as unit_price") \
        .select("city, count, count * unit_price as total_price") \
        .group_by("city") \
        .select("city, "
                "sum(count) as sales_volume, "
                "sum(total_price) as sales") \
        .insert_into("Results")

    t_env.execute("word_count")
Example #15
st_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka-base_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-jdbc_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-sql-connector-kafka_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/kafka-clients-2.1.0.jar"
)

# read from Kafka
properties = {
    "zookeeper.connect": "nn1.hadoop:2181,nn2.hadoop:2181,s1.hadoop:2181",
    "bootstrap.servers": "nn1.hadoop:9092,nn2.hadoop:9092,s1.hadoop:9092",
    "group.id": "testGroup"
}
st_env.connect(Kafka().properties(properties).version("universal").topic("test").start_from_latest()) \
    .with_format(Json()).with_schema(Schema() \
        .field('throughputReqMax', DataTypes.BIGINT()) \
        .field('throughputReqTotal', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

# write to CSV
st_env.connect(FileSystem().path('/usr/local/flink/test/result3.txt')) \
    .with_format(OldCsv()
                .field('sub', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sub', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

# subtract throughputReqMax from throughputReqTotal in the Kafka data and insert the result into the sink
st_env.from_path('mySource')\
    .select("(throughputReqTotal-throughputReqMax)") \
    .insert_into('mySink')
st_env.execute("job_test")
Example #16
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

t_env.connect(Kafka()
              .version("0.11")
              .topic("test")
              .property("zookeeper.connect", "localhost:2181")
              .property("bootstrap.servers", "localhost:9092")
              ) \
    .in_append_mode() \
    .with_format(Csv()
                 .line_delimiter("\r\n")      \
                 .derive_schema()) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path('../production_data/kafkaoutput')) \
    .with_format(OldCsv()
                 .field('tbd', DataTypes.INT())) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_sink('mySink')

t_env.scan('mySource') \
    .select('tbd') \
    .where("tbd = 1") \
    .insert_into('mySink')

t_env.execute("tutorial_job")
Example #17
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)


@udf(input_types=[DataTypes.DECIMAL(38, 12, nullable=True)],
     result_type=DataTypes.DECIMAL(38, 12, nullable=True))
def myadd(i):
    return i * i * 2


# add = udf(myadd, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())

t_env.register_function("add", myadd)

t_env.connect(FileSystem().path('/tmp/input')) \
    .with_format(OldCsv()
                 .field('b', DataTypes.DECIMAL(38,12,nullable=True))) \
    .with_schema(Schema()
                 .field('b', DataTypes.DECIMAL(38,12,nullable=True))) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/tmp/output')) \
    .with_format(OldCsv()
                 .field('sum', DataTypes.DECIMAL(38,12,nullable=True))) \
    .with_schema(Schema()
                 .field('sum', DataTypes.DECIMAL(38,12,nullable=True))) \
    .create_temporary_table('mySink')

t_env.from_path('mySource')\
    .select("add(b)") \
    .insert_into('mySink')
Example #18
        # the path may be absolute or relative, but note it must not be prefixed with file://
        t_env.set_python_requirements(dir_requirements, dir_cache)
    else:
        # Option 2: specify a requirements.txt describing the dependencies; they are downloaded when the job runs (not recommended).
        t_env.set_python_requirements(dir_requirements)

# ########################### create the source table ###########################

dir_log = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                       'syslog.text')

# create the source table via the Table API
# the source table has a single column named line
t_env.connect(FileSystem().path(dir_log)) \
    .with_format(OldCsv()
                 .line_delimiter('\n')
                 .field('line', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('line', DataTypes.STRING())) \
    .create_temporary_table('source')

# ########################### create the sink table ###########################

dir_result = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                          'result.csv')

# delete the file/directory if it already exists
if os.path.exists(dir_result):
    if os.path.isfile(dir_result):
        os.remove(dir_result)
    else:
Example #19
t_env = StreamTableEnvironment.create(env)

# register function
t_env.register_function("add1", add1)
t_env.register_function("add2", add2)
t_env.register_function("add3", add3)
t_env.register_function("add4", add4)

t = t_env.from_elements([(1, 2, 'Welcome'), (2, 3, 'To'), (3, 4, 'PyFlink')],
                        ['a', 'b', 'c'])

t_env.connect(FileSystem().path(sink_path))\
    .with_format(OldCsv()
        .field_delimiter(',')
        .field("add1", DataTypes.BIGINT())
        .field("add2", DataTypes.BIGINT())
        .field("add3", DataTypes.BIGINT())
        .field("add4", DataTypes.BIGINT())
        .field("b", DataTypes.BIGINT())
        .field("c", DataTypes.STRING()))\
    .with_schema(Schema()
        .field("add1", DataTypes.BIGINT())
        .field("add2", DataTypes.BIGINT())
        .field("add3", DataTypes.BIGINT())
        .field("add4", DataTypes.BIGINT())
        .field("b", DataTypes.BIGINT())
        .field("c", DataTypes.STRING()))\
    .register_table_sink("pyflink_sink")

t.select("add1(a, b), add2(a, b), add3(a, b), add4(a, b), b, c").insert_into(
    "pyflink_sink")
Example #20
import pyflink as fk
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)
# connector-based input
t_env.connect(FileSystem().path("./data/in")).with_format(OldCsv().field("word",DataTypes.STRING()))\
    .with_schema(Schema().field('word',DataTypes.STRING()))\
    .create_temporary_table('mySource')
t_env.connect(FileSystem().path("./data/out"))\
    .with_format(OldCsv().field_delimiter('\t').field('word',DataTypes.STRING()).field('count',DataTypes.BIGINT()))\
    .with_schema(Schema().field('word',DataTypes.STRING()).field('count',DataTypes.BIGINT()))\
    .create_temporary_table("mySink")
my_source_ddl = """
    create table mySource (
        word VARCHAR
    ) with (
        'connector.type' = 'filesystem',
        'format.type' = 'csv',
        'connector.path' = './data/in/in.csv'
    )
"""

my_sink_ddl = """
    create table mySink (
        word VARCHAR,
        `count` BIGINT
Example #21
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.udf import udf

env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)

add = udf(lambda i, j: i + j, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())
t_env.register_function("add", add)

t_env.connect(FileSystem().path('/opt/examples/data/udf_add_input')) \
    .with_format(OldCsv()
                 .field('a', DataTypes.BIGINT())
                 .field('b', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('a', DataTypes.BIGINT())
                 .field('b', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/opt/examples/data/udf_add_output')) \
    .with_format(OldCsv()
                 .field('sum', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sum', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

t_env.from_path('mySource')\
    .select("add(a, b)") \
    .insert_into('mySink')
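insert_into only declares the pipeline; in this older API nothing runs until the table environment's execute() is called, as the other examples do. A minimal trigger (the job name is arbitrary):

# trigger the declared insert; the job name is arbitrary
t_env.execute('udf_add_job')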
Example #22
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig, WriteMode
from pyflink.table.descriptors import FileSystem, OldCsv, Schema
from pyflink.table.types import DataTypes

t_config = TableConfig()
env = ExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = BatchTableEnvironment.create(env, t_config)

source_file = '/notebooks/big-text.txt'
sink_file = '/notebooks/sink.csv'

t_env.connect(FileSystem().path(source_file)) \
    .with_format(OldCsv()
                 .line_delimiter('\n')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path(sink_file)) \
    .with_format(OldCsv()
                 .field_delimiter(',')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .register_table_sink('mySink')

t_env.scan('mySource').group_by('word').select('word, count(1)').insert_into('mySink')

t_env.execute('wordcount')
Example #23
exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

t_env.connect(Kafka()
              .version("0.11")
              .topic("test")
              .property("zookeeper.connect", "localhost:2181")
              .property("bootstrap.servers", "localhost:9092")
              ) \
    .in_append_mode() \
    .with_format(Csv()
                 .line_delimiter("\r\n")      \
                 .derive_schema()) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.STRING())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path('../production_data/kafkaoutput')) \
    .with_format(OldCsv()
                 .field('tbd', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.STRING())) \
    .register_table_sink('mySink')

t_env.scan('mySource') \
    .insert_into('mySink')

t_env.execute("tutorial_job")
Example #24
exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

input_path = './input/data.json'
output_path = './output/output.txt'
if os.path.exists(output_path):
    os.remove(output_path)

wpre.wcpretreatment(input_path)
input_path = './input'

t_env.connect(FileSystem().path(input_path)) \
    .with_format(OldCsv()
                 .field('dateRep', DataTypes.DATE(True))) \
    .with_schema(Schema()
                 .field('dateRep', DataTypes.DATE(True))) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path(output_path)) \
    .with_format(OldCsv()
                 .field_delimiter('\t')
                 .field('dateRep', DataTypes.DATE(True))
                 .field('deaths_weekly', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('dateRep', DataTypes.DATE(True))
                 .field('deaths_weekly', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

t_env.from_path('mySource') \
Example #25
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(
    env,
    environment_settings = EnvironmentSettings.new_instance()
    .use_blink_planner()
    .build(),
)

result_path = '/notebooks/output.csv'

print('Results directory:', result_path)

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv()
    .field_delimiter(',')
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).with_schema(
    Schema()
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).register_table_sink(
    'Results'
)

elements = [(word, 1) for word in content.split(' ')]

t_env.from_elements(elements, ['word', 'count']).group_by('word').select(
    'word, count(1) as count'
).insert_into('Results')
Example #26
    sentences = [string]

    x = np.zeros((len(sentences), maxlen))
    for i, sentence in enumerate(sentences):
        for no, k in enumerate(sentence.split()[:maxlen][::-1]):
            x[i, -1 - no] = dic.get(k, UNK)
    indices = np.argmax(sess.run(Y, feed_dict={X: x}), axis=1)
    return label[indices[0]]


t_env.set_python_requirements('/notebooks/requirements.txt')

t_env.register_function('predict', predict)

result_path = '/notebooks/output-tensorflow.csv'

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv().field_delimiter(',').field('sentence', DataTypes.STRING()).field(
        'label', DataTypes.STRING())).with_schema(Schema().field(
            'sentence', DataTypes.STRING()).field(
                'label', DataTypes.STRING())).create_temporary_table('mySink')

elements = [(sentence, ) for sentence in content.split('\n')]

t_env.from_elements(
    elements,
    ['sentence']).select('sentence, predict(sentence)').insert_into('mySink')

t_env.execute('predict')