Example #1
    def test_register_table_source(self):
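        """Register a CSV source via the connect/descriptor API, project it
        into a CSV sink and verify the written file contents."""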
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        self.prepare_csv_source(source_path, data, field_types, field_names)
        t_env = self.t_env
        sink_path = os.path.join(self.tempdir, 'streaming2.csv')
        if os.path.isfile(sink_path):
            os.remove(sink_path)
        t_env.register_table_sink(
            "sink",
            field_names, field_types, CsvTableSink(sink_path))

        # connect source
        t_env.connect(FileSystem().path(source_path))\
             .with_format(OldCsv()
                          .field_delimiter(',')
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .with_schema(Schema()
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .register_table_source("source")
        t_env.scan("source") \
             .select("a + 1, b, c") \
             .insert_into("sink")
        t_env.execute()

        with open(sink_path, 'r') as f:
            lines = f.read()
            assert lines == '2,Hi,Hello\n' + '3,Hello,Hello\n'
Example #2
    def test_list_tables(self):
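        """Register one table source and two table sinks, then check that
        list_tables() returns all of the registered names."""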
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
        data = []
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)
        t_env = self.t_env
        t_env.register_table_source("Orders", csv_source)
        tmp_dir = tempfile.gettempdir()
        tmp_csv = tmp_dir + '/streaming2.csv'
        t_env.register_table_sink("Sinks", field_names, field_types,
                                  CsvTableSink(tmp_csv))
        t_env.register_table_sink("Results", field_names, field_types,
                                  CsvTableSink(tmp_csv))

        actual = t_env.list_tables()

        expected = ['Orders', 'Results', 'Sinks']
        self.assert_equals(actual, expected)
Example #3
def word_count():
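    """A word count example: read (word, 1) pairs from a CSV source, count the
    occurrences of each word and write the result to a CSV sink."""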
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    with open(source_path, 'w') as f:
        for word in content.split(" "):
            f.write(",".join([word, "1"]))
            f.write("\n")

    t_config = TableConfig.Builder() \
        .as_batch_execution() \
        .set_parallelism(1) \
        .build()
    t_env = TableEnvironment.create(t_config)

    field_names = ["word", "cout"]
    field_types = [DataTypes.STRING, DataTypes.LONG]

    # register Word table in table environment
    t_env.register_table_source(
        "Word", CsvTableSource(source_path, field_names, field_types))

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)

    t_env.register_table_sink("Results", field_names, field_types,
                              CsvTableSink(tmp_csv))

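    # group by word, count the occurrences of each word and write the result to the sink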
    t_env.scan("Word") \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute()
Example #4
def test_end_to_end():
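    """End-to-end example: read rows from a CSV source, filter and project
    them, write the result to a CSV sink and verify the output file."""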
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    with open(source_path, 'w') as f:
        lines = '1,hi,hello\n' + '2,hi,hello\n'
        f.write(lines)
    _find_flink_home()
    print("using %s as FLINK_HOME..." % os.environ["FLINK_HOME"])

    t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
    t_env = TableEnvironment.get_table_environment(t_config)

    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

    # register Orders table in table environment
    t_env.register_table_source(
        "Orders",
        CsvTableSource(source_path, field_names, field_types))

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)

    t_env.register_table_sink(
        "Results",
        field_names, field_types, CsvTableSink(tmp_csv))

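    # filter and project the source rows, then write the result into the sink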
    t_env.scan("Orders") \
         .where("a > 0") \
         .select("a + 1, b, c") \
         .insert_into("Results")

    t_env.execute()
    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '2,hi,hello\n' + '3,hi,hello\n'
    print("test passed, the log file is under this directory: %s/log" % os.environ["FLINK_HOME"])
Example #5
    def test_sql_update(self):
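        """Copy all rows from a registered source into a sink with an
        INSERT INTO statement and verify the written file."""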
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)
        t_env = self.t_env
        t_env.register_table_source("source", csv_source)
        tmp_dir = tempfile.gettempdir()
        tmp_csv = tmp_dir + '/streaming2.csv'
        if os.path.isfile(tmp_csv):
            os.remove(tmp_csv)
        t_env.register_table_sink("sinks", field_names, field_types,
                                  CsvTableSink(tmp_csv))

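        # run an INSERT INTO statement that copies the source table into the sink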
        t_env.sql_update("insert into sinks select * from source")
        t_env.execute("test_sql_job")

        with open(tmp_csv, 'r') as f:
            lines = f.read()
            assert lines == '1,Hi,Hello\n' + '2,Hello,Hello\n'
Example #6
    def test_register_table_source_sink(self):
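        """Register a table source and a table sink, copy the source into
        the sink and verify the written file."""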
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
        data = [(1, "Hi", "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)
        t_env = self.t_env
        tmp_dir = tempfile.gettempdir()
        tmp_csv = tmp_dir + '/streaming2.csv'
        if os.path.isfile(tmp_csv):
            os.remove(tmp_csv)

        t_env.register_table_source("Orders", csv_source)
        t_env.register_table_sink("Results", field_names, field_types,
                                  CsvTableSink(tmp_csv))
        t_env.scan("Orders").insert_into("Results")
        t_env.execute()

        with open(tmp_csv, 'r') as f:
            lines = f.read()
            assert lines == '1,Hi,Hello\n'
Example #7
    def test_sql_query(self):
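        """Run a SQL query over a scanned table and write the projected
        result to a registered sink."""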
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)
        t_env = self.t_env
        t_env.register_table_source("Source", csv_source)
        source = t_env.scan("Source")
        tmp_dir = tempfile.gettempdir()
        tmp_csv = tmp_dir + '/streaming2.csv'
        if os.path.isfile(tmp_csv):
            os.remove(tmp_csv)
        t_env.register_table_sink("sinks", field_names, field_types,
                                  CsvTableSink(tmp_csv))

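        # the Table object is interpolated into the query string to reference the scanned table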
        result = t_env.sql_query("select a + 1, b, c from %s" % source)
        result.insert_into("sinks")
        t_env.execute()

        with open(tmp_csv, 'r') as f:
            lines = f.read()
            assert lines == '2,Hi,Hello\n' + '3,Hello,Hello\n'