def execute_with_table_execute_insert(self, t_env):
    """Select ``func1``/``func2`` over a two-row table and check the sink.

    :param t_env: table environment used to build the source table. The
        functions ``func1`` and ``func2`` and the table ``sink`` are only
        referenced by name here; presumably the caller registered them.
    """
    rows = [(1, "Hi"), (2, "Hello")]
    source = t_env.from_elements(rows, ["a", "b"])
    exec_insert_table(source.select("func1(a, b), func2(a, b)"), "sink")

    collected = source_sink_utils.results()
    self.assert_equals(
        collected, ['1 and Hi,1 or Hi', '2 and Hello,2 or Hello'])
def test_stream_case(self): from pyflink.shell import s_env, st_env, FileSystem, OldCsv, DataTypes, Schema # example begin import tempfile import os import shutil sink_path = tempfile.gettempdir() + '/streaming.csv' if os.path.exists(sink_path): if os.path.isfile(sink_path): os.remove(sink_path) else: shutil.rmtree(sink_path) s_env.set_parallelism(1) t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c']) st_env.connect(FileSystem().path(sink_path))\ .with_format(OldCsv() .field_delimiter(',') .field("a", DataTypes.BIGINT()) .field("b", DataTypes.STRING()) .field("c", DataTypes.STRING()))\ .with_schema(Schema() .field("a", DataTypes.BIGINT()) .field("b", DataTypes.STRING()) .field("c", DataTypes.STRING()))\ .create_temporary_table("stream_sink") exec_insert_table(t.select("a + 1, b, c"), "stream_sink") # verify code, do not copy these code to shell.py with open(sink_path, 'r') as f: lines = f.read() self.assertEqual(lines, '2,hi,hello\n' + '3,hi,hello\n')
def test_table_environment_with_blink_planner(self):
    """Run a CSV-to-CSV job on a batch environment built with the blink planner.

    The source rows are projected with ``1 + a`` and written to a
    ``CsvTableSink``. Every file found under the sink path is read back and
    the collected lines are compared with the expected rows.
    """
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance(
        ).in_batch_mode().use_blink_planner().build())

    # Pass the components to os.path.join separately instead of gluing them
    # with a hard-coded '/' and handing join a single argument.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    sink_path = os.path.join(self.tempdir, 'results')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env.register_table_source("source", csv_source)
    t_env.register_table_sink(
        "sink",
        CsvTableSink(field_names, field_types, sink_path))

    source = t_env.from_path("source")
    result = source.alias("a, b, c").select("1 + a, b, c")
    exec_insert_table(result, "sink")

    # The sink path is walked as a directory tree, so gather the lines of
    # every file found beneath it.
    results = []
    for root, _dirs, files in os.walk(sink_path):
        for sub_file in files:
            with open(os.path.join(root, sub_file), 'r') as f:
                # Iterating the file yields each line including its newline,
                # exactly like the readline() loop it replaces.
                results.extend(f)

    self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
def test_add_python_file(self):
    """Register a standalone Python file and import it from inside a UDF."""
    env = self.t_env

    lib_dir = os.path.join(self.tempdir, "python_file_dir_" + str(uuid.uuid4()))
    os.mkdir(lib_dir)
    lib_path = os.path.join(lib_dir, "test_dependency_manage_lib.py")
    with open(lib_path, 'w') as lib_file:
        lib_file.write("def add_two(a):\n return a + 2")
    env.add_python_file(lib_path)

    def plus_two(i):
        # The import lives in the function body, so it is resolved when the
        # UDF runs rather than when this test method is defined.
        from test_dependency_manage_lib import add_two
        return add_two(i)

    env.create_temporary_system_function(
        "add_two", udf(plus_two, DataTypes.BIGINT(), DataTypes.BIGINT()))
    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()]))

    t = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    exec_insert_table(t.select(expr.call("add_two", t.a), t.a), "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["3,1", "4,2", "5,3"])
def test_data_types_only_supported_in_blink_planner(self):
    """Pass a TIMESTAMP_WITH_LOCAL_TIME_ZONE(3) value through a pandas UDF twice.

    The UDF asserts that it receives a ``pandas.Series`` whose first element
    is a ``datetime.datetime`` equal to the localized input, then returns the
    series unchanged.
    """
    import pandas as pd

    env = self.t_env
    tz_name = env.get_config().get_local_timezone()
    naive_datetime = datetime.datetime(1970, 1, 2, 0, 0, 0, 123000)
    local_datetime = pytz.timezone(tz_name).localize(naive_datetime)

    def local_zoned_timestamp_func(local_zoned_timestamp_param):
        assert isinstance(local_zoned_timestamp_param, pd.Series)
        head = local_zoned_timestamp_param[0]
        assert isinstance(head, datetime.datetime), \
            'local_zoned_timestamp_param of wrong type %s !' % type(head)
        assert head == local_datetime, \
            'local_zoned_timestamp_param is wrong value %s, %s!' % \
            (head, local_datetime)
        return local_zoned_timestamp_param

    pandas_udf = udf(local_zoned_timestamp_func,
                     result_type=DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3),
                     udf_type="pandas")
    env.create_temporary_system_function("local_zoned_timestamp_func", pandas_udf)

    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(
            ['a'], [DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3)]))

    row_type = DataTypes.ROW(
        [DataTypes.FIELD("a", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))])
    t = env.from_elements([(local_datetime,)], row_type)

    exec_insert_table(
        t.select("local_zoned_timestamp_func(local_zoned_timestamp_func(a))"),
        "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["1970-01-02T00:00:00.123Z"])
def test_add_python_archive(self):
    """Ship a zip archive with the job and read a file from it inside a UDF.

    The archive contains ``data.txt`` holding ``2`` and is registered under
    the target directory name ``data``; the UDF adds that number to its input.
    """
    env = self.t_env
    base_dir = self.tempdir

    archive_dir_path = os.path.join(base_dir, "archive_" + str(uuid.uuid4()))
    os.mkdir(archive_dir_path)
    with open(os.path.join(archive_dir_path, "data.txt"), 'w') as data_file:
        data_file.write("2")

    # NOTE(review): the archive base name is the parent of archive_dir_path,
    # so the zip appears to be created beside self.tempdir, not inside it --
    # confirm this is intended.
    archive_base_name = os.path.dirname(archive_dir_path)
    archive_file_path = shutil.make_archive(archive_base_name, 'zip', archive_dir_path)
    env.add_python_archive(archive_file_path, "data")

    def add_from_file(i):
        with open("data/data.txt", 'r') as f:
            return i + int(f.read())

    env.create_temporary_system_function(
        "add_from_file",
        udf(add_from_file, DataTypes.BIGINT(), DataTypes.BIGINT()))
    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()]))

    t = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    exec_insert_table(t.select(expr.call('add_from_file', t.a), t.a), "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["3,1", "4,2", "5,3"])
def test_basic_functionality(self):
    """Mix a pandas UDF with general Python UDFs in a single query."""
    env = self.t_env

    # pandas UDF
    pandas_add_one = udf(lambda i: i + 1,
                         result_type=DataTypes.BIGINT(),
                         udf_type="pandas")
    env.create_temporary_system_function("add_one", pandas_add_one)

    env.create_temporary_system_function("add", add)

    # general Python UDF
    general_subtract_one = udf(SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT())
    env.create_temporary_system_function("subtract_one", general_subtract_one)

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(),
                  DataTypes.BIGINT(), DataTypes.BIGINT()]
    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd'], sink_types))

    t = env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    projected = t.where("add_one(b) <= 3").select(
        "a, b + 1, add(a + 1, subtract_one(c)) + 2, "
        "add(add_one(a), 1L)")
    exec_insert_table(projected, "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["1,3,6,3", "3,2,14,5"])
def test_set_requirements_without_cached_directory(self):
    """Set a requirements file without a cache directory and check the install.

    The UDF asserts that ``cloudpickle`` is loaded from the directory named by
    the ``_PYTHON_REQUIREMENTS_INSTALL_DIR`` environment variable.
    """
    env = self.t_env

    requirements_txt_path = os.path.join(self.tempdir, str(uuid.uuid4()))
    with open(requirements_txt_path, 'w') as req_file:
        req_file.write("cloudpickle==1.2.2")
    env.set_python_requirements(requirements_txt_path)

    def check_requirements(i):
        import cloudpickle
        module_path = os.path.abspath(cloudpickle.__file__)
        assert module_path.startswith(
            os.environ['_PYTHON_REQUIREMENTS_INSTALL_DIR'])
        return i

    env.create_temporary_system_function(
        "check_requirements",
        udf(check_requirements, DataTypes.BIGINT(), DataTypes.BIGINT()))
    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()]))

    t = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    exec_insert_table(
        t.select(expr.call('check_requirements', t.a), t.a), "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["1,1", "2,2", "3,3"])
def test_pipeline(self):
    """Fit a transformer + estimator pipeline and apply it to serving data."""
    t_env = MLEnvironmentFactory().get_default().get_stream_table_environment()

    training_rows = [(1, 2), (1, 4), (1, 0), (10, 2), (10, 4), (10, 0)]
    train_table = t_env.from_elements(training_rows, ['a', 'b'])
    serving_table = t_env.from_elements([(0, 0), (12, 3)], ['a', 'b'])

    t_env.register_table_sink(
        "PredictResults",
        source_sink_utils.TestAppendSink(['predict_result'], [DataTypes.BOOLEAN()]))

    # transformer, output features column which is the sum of a and b.
    transformer = PythonAddTransformer(selected_cols=["a", "b"],
                                       output_col="features")

    # estimator
    estimator = PythonEstimator()
    estimator = estimator.set_vector_col("features")
    estimator = estimator.set_prediction_col("predict_result")

    # pipeline
    pipeline = Pipeline().append_stage(transformer).append_stage(estimator)

    fitted = pipeline.fit(t_env, train_table)
    exec_insert_table(fitted.transform(t_env, serving_table), 'PredictResults')

    collected = source_sink_utils.results()
    # the first input is false since 0 + 0 is smaller than the max_sum 14.
    # the second input is true since 12 + 3 is bigger than the max_sum 14.
    self.assert_equals(collected, ["false", "true"])
def test_data_types_only_supported_in_blink_planner(self):
    """Pass a TIMESTAMP_WITH_LOCAL_TIME_ZONE(3) value through a Python UDF twice.

    The UDF asserts that its argument equals the localized input datetime and
    returns it unchanged.
    """
    env = self.t_env
    tz_name = env.get_config().get_local_timezone()
    local_datetime = pytz.timezone(tz_name).localize(
        datetime.datetime(1970, 1, 1, 0, 0, 0, 123000))

    @udf(result_type=DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))
    def local_zoned_timestamp_func(local_zoned_timestamp_param):
        assert local_zoned_timestamp_param == local_datetime, \
            'local_zoned_timestamp_param is wrong value %s !' % local_zoned_timestamp_param
        return local_zoned_timestamp_param

    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(
            ['a'], [DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3)]))

    row_type = DataTypes.ROW(
        [DataTypes.FIELD("a", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))])
    t = env.from_elements([(local_datetime, )], row_type)

    applied_twice = local_zoned_timestamp_func(local_zoned_timestamp_func(t.a))
    exec_insert_table(t.select(applied_twice), "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["1970-01-01T00:00:00.123Z"])
def test_pipeline_from_and_to_java_json(self):
    """Load a pipeline from Java-produced JSON, run it, and serialize it back.

    The JSON describes a single ``SelectColumnTransformer`` stage selecting
    columns ``a`` and ``b``; the re-serialized JSON must equal the input.
    """
    # json generated from Java api
    java_json = (
        '[{"stageClassName":"org.apache.flink.ml.pipeline.'
        'UserDefinedPipelineStages$SelectColumnTransformer",'
        '"stageJson":"{\\"selectedCols\\":\\"[\\\\\\"a\\\\\\",'
        '\\\\\\"b\\\\\\"]\\"}"}]'
    )

    # load json
    pipeline = Pipeline()
    pipeline.load_json(java_json)
    python_json = pipeline.to_json()

    t_env = MLEnvironmentFactory().get_default().get_stream_table_environment()
    t_env.register_table_sink(
        "TestJsonResults",
        source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()]))

    source_table = t_env.from_elements(
        [(1, 2, 3, 4), (4, 3, 2, 1)], ['a', 'b', 'c', 'd'])
    first_stage = pipeline.get_stages()[0]
    exec_insert_table(first_stage.transform(t_env, source_table),
                      "TestJsonResults")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["1,2", "4,3"])
    self.assertEqual(python_json, java_json)
def test_table_environment_with_blink_planner(self):
    """Run a CSV-to-CSV job on a stream environment built with the blink planner.

    Parallelism is set to 1 and the sink path is read back as a single file.
    The first two lines of that file are compared with the expected rows.
    """
    self.env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance(
        ).use_blink_planner().build())

    # Pass the components to os.path.join separately instead of gluing them
    # with a hard-coded '/' and handing join a single argument.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    sink_path = os.path.join(self.tempdir, 'result.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env.register_table_source("source", csv_source)
    t_env.register_table_sink(
        "sink",
        CsvTableSink(field_names, field_types, sink_path))

    source = t_env.from_path("source")
    result = source.alias("a, b, c").select("1 + a, b, c")
    exec_insert_table(result, "sink")

    # Only the first two lines are inspected, matching the two input rows.
    with open(sink_path, 'r') as f:
        results = [f.readline(), f.readline()]

    self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
def load_model(self, table_env):
    """
    Materialize the model data table and store its first result as ``max_sum``.

    The model table is written to a retract sink registered as
    ``Model_Results``; the value kept on ``self.max_sum`` is later used for
    prediction.

    :param table_env: table environment on which the sink is registered.
    """
    table_env.register_table_sink(
        "Model_Results",
        source_sink_utils.TestRetractSink(["max_sum"], [DataTypes.BIGINT()]))
    exec_insert_table(self._model_data_table, "Model_Results")

    model_rows = source_sink_utils.results()
    # NOTE(review): ``apply(0)`` is not a Python list method; presumably the
    # results object is a JVM-side collection -- confirm.
    self.max_sum = model_rows.apply(0)
def test_from_element(self):
    """Build a one-row table covering many data types and check its rendering."""
    field_names = list("abcdefghijklmnopqr")
    nested_row_type = DataTypes.ROW([
        DataTypes.FIELD("a", DataTypes.BIGINT()),
        DataTypes.FIELD("b", DataTypes.DOUBLE()),
    ])
    field_types = [
        DataTypes.BIGINT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING(),
        DataTypes.STRING(),
        DataTypes.DATE(),
        DataTypes.TIME(),
        DataTypes.TIMESTAMP(3),
        DataTypes.INTERVAL(DataTypes.SECOND(3)),
        DataTypes.ARRAY(DataTypes.DOUBLE()),
        DataTypes.ARRAY(DataTypes.DOUBLE(False)),
        DataTypes.ARRAY(DataTypes.STRING()),
        DataTypes.ARRAY(DataTypes.DATE()),
        DataTypes.DECIMAL(38, 18),
        nested_row_type,
        DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
        DataTypes.BYTES(),
        ExamplePointUDT(),
        PythonOnlyUDT(),
    ]
    # Pair every column name with its type to form the row schema.
    schema = DataTypes.ROW([
        DataTypes.FIELD(field_name, field_type)
        for field_name, field_type in zip(field_names, field_types)
    ])

    self.t_env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(field_names, field_types))

    single_row = (
        1,
        1.0,
        "hi",
        "hello",
        datetime.date(1970, 1, 2),
        datetime.time(1, 0, 0),
        datetime.datetime(1970, 1, 2, 0, 0),
        datetime.timedelta(days=1, microseconds=10),
        [1.0, None],
        array.array("d", [1.0, 2.0]),
        ["abc"],
        [datetime.date(1970, 1, 2)],
        Decimal(1),
        Row("a", "b")(1, 2.0),
        {"key": 1.0},
        bytearray(b'ABCD'),
        ExamplePoint(1.0, 2.0),
        PythonOnlyPoint(3.0, 4.0),
    )
    t = self.t_env.from_elements([single_row], schema)
    exec_insert_table(t, "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, [
        '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
        '86400000,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
        '1,1,2.0,{key=1.0},[65, 66, 67, 68],[1.0, 2.0],[3.0, 4.0]'
    ])
def test_open(self):
    """Run the ``Subtract`` UDF with the ``python.metric.enabled`` option set to true."""
    env = self.t_env
    env.get_config().get_configuration().set_string('python.metric.enabled', 'true')

    subtract_udf = udf(Subtract(), result_type=DataTypes.BIGINT())
    env.create_temporary_system_function("subtract", subtract_udf)

    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()]))

    t = env.from_elements([(1, 2), (2, 5), (3, 4)], ['a', 'b'])
    exec_insert_table(t.select("a, subtract(b)"), "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["1,1", "2,4", "3,3"])
def test_overwrite_builtin_function(self):
    """Register a UDF named ``plus`` and check that the query uses it.

    The UDF computes ``i + j - 1``, so the expected results are one less than
    a plain sum of the two columns.
    """
    env = self.t_env

    custom_plus = udf(lambda i, j: i + j - 1, result_type=DataTypes.BIGINT())
    env.create_temporary_system_function("plus", custom_plus)

    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(['a'], [DataTypes.BIGINT()]))

    t = env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    exec_insert_table(t.select("plus(a, b)"), "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["2", "6", "3"])
def test_insert_into(self):
    """Insert a single-row table into an append sink and read it back."""
    column_names = ["a", "b", "c"]
    column_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    self.t_env.register_table_sink(
        "Sinks",
        source_sink_utils.TestAppendSink(column_names, column_types))

    single_row_table = self.t_env.from_elements(
        [(1, "Hi", "Hello")], ["a", "b", "c"])
    exec_insert_table(single_row_table, "Sinks")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ['1,Hi,Hello'])
def test_set_requirements_with_cached_directory(self):
    """Install a requirement from a local cache directory and use it in a UDF.

    A requirements file naming ``python-package1==0.0.0`` is written together
    with a cache directory holding that package's tarball (decoded from the
    embedded base64 payload). The UDF imports ``plus`` from the package.
    """
    import base64

    env = self.t_env
    base_dir = self.tempdir

    requirements_txt_path = os.path.join(
        base_dir, "requirements_txt_" + str(uuid.uuid4()))
    with open(requirements_txt_path, 'w') as req_file:
        req_file.write("python-package1==0.0.0")

    requirements_dir_path = os.path.join(
        base_dir, "requirements_dir_" + str(uuid.uuid4()))
    os.mkdir(requirements_dir_path)

    # This base64 data is encoded from a python package file which includes a
    # "python_package1" module. The module contains a "plus(a, b)" function.
    # The base64 can be recomputed by following code:
    # base64.b64encode(open("python-package1-0.0.0.tar.gz", "rb").read()).decode("utf-8")
    package_payload = base64.b64decode(
        "H4sICNefrV0C/2Rpc3QvcHl0aG9uLXBhY2thZ2UxLTAuMC4wLnRhcgDtmVtv2jAYhnPtX2H1CrRCY+ckI"
        "XEx7axuUA11u5imyICTRc1JiVnHfv1MKKWjYxwKEdPehws7xkmUfH5f+3PyqfqWpa1cjG5EKFnLbOvfhX"
        "FQTI3nOPPSdavS5Pa8nGMwy3Esi3ke9wyTObbnGNQxamBSKlFQavzUryG8ldG6frpbEGx4yNmDLMp/hPy"
        "P8b+6fNN613vdP1z8XdteG3+ug/17/F3Hcw1qIv5H54NUYiyUaH2SRRllaYeytkl6IpEdujI2yH2XapCQ"
        "wSRJRDHt0OveZa//uUfeZonUvUO5bHo+0ZcoVo9bMhFRvGx9H41kWj447aUsR0WUq+pui8arWKggK5Jli"
        "wGOo/95q79ovXi6/nfyf246Dof/n078fT9KI+X77Xx6BP83bX4Xf5NxT7dz7toO/L8OxjKgeTwpG+KcDp"
        "sdQjWFVJMipYI+o0MCk4X/t2UYtqI0yPabCHb3f861XcD/Ty/+Y5nLdCzT0dSPo/SmbKsf6un+b7KV+Ls"
        "W4/D/OoC9w/930P9eGwM75//csrD+Q/6P/P/k9D/oX3988Wqw1bS/tf6tR+s/m3EG/ddBqXO9XKf15C8p"
        "P9k4HZBtBgzZaVW5vrfKcj+W32W82ygEB9D/Xu9+4/qfP9L/rBv0X1v87yONKRX61/qfzwqjIDzIPTbv/"
        "7or3/88i0H/tfBFW7s/s/avRInQH06ieEy7tDrQeYHUdRN7wP+n/vf62LOH/pld7f9xz7a5Pfufedy0oP"
        "86iJI8KxStAq6yLC4JWdbbVbWRikR2z1ZGytk5vauW3QdnBFE6XqwmykazCesAAAAAAAAAAAAAAAAAAAA"
        "AAAAAAAAAAAAAAOBw/AJw5CHBAFAAAA==")
    package_file_name = "python-package1-0.0.0.tar.gz"
    with open(os.path.join(requirements_dir_path, package_file_name), 'wb') as pkg_file:
        pkg_file.write(package_payload)

    env.set_python_requirements(requirements_txt_path, requirements_dir_path)

    def add_one(i):
        from python_package1 import plus
        return plus(i, 1)

    env.create_temporary_system_function(
        "add_one",
        udf(add_one, DataTypes.BIGINT(), DataTypes.BIGINT()))
    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()]))

    t = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    exec_insert_table(t.select(expr.call('add_one', t.a), t.a), "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["2,1", "3,2", "4,3"])
def test_udf_without_arguments(self):
    """Call zero-argument UDFs, one flagged deterministic and one not."""
    env = self.t_env

    constant_one = udf(lambda: 1, result_type=DataTypes.BIGINT(), deterministic=True)
    env.create_temporary_system_function("one", constant_one)
    constant_two = udf(lambda: 2, result_type=DataTypes.BIGINT(), deterministic=False)
    env.create_temporary_system_function("two", constant_two)

    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()]))

    t = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    exec_insert_table(t.select("one(), two()"), "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["1,2", "1,2", "1,2"])
def test_sql_query(self):
    """Query an in-memory table through SQL and write the result to a sink."""
    source = self.t_env.from_elements(
        [(1, "Hi", "Hello"), (2, "Hello", "Hello")], ["a", "b", "c"])

    column_names = ["a", "b", "c"]
    column_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    self.t_env.register_table_sink(
        "sinks",
        source_sink_utils.TestAppendSink(column_names, column_types))

    # The table object itself is formatted into the statement text.
    statement = "select a + 1, b, c from %s" % source
    exec_insert_table(self.t_env.sql_query(statement), "sinks")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ['2,Hi,Hello', '3,Hello,Hello'])
def test_udf_in_join_condition(self):
    """Use a Python UDF inside the condition applied to a join of two tables."""
    env = self.t_env

    left = env.from_elements([(2, "Hi")], ['a', 'b'])
    right = env.from_elements([(2, "Flink")], ['c', 'd'])

    identity_udf = udf(lambda i: i, result_type=DataTypes.BIGINT())
    env.create_temporary_system_function("f", identity_udf)

    sink_types = [DataTypes.BIGINT(), DataTypes.STRING(),
                  DataTypes.BIGINT(), DataTypes.STRING()]
    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd'], sink_types))

    exec_insert_table(left.join(right).where("f(a) = c"), "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["2,Hi,2,Flink"])
def test_java_transformer(self):
    """Apply ``WrapperTransformer`` selecting columns ``a`` and ``b``."""
    t_env = MLEnvironmentFactory().get_default().get_stream_table_environment()

    t_env.register_table_sink(
        "TransformerResults",
        source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()]))

    source_table = t_env.from_elements(
        [(1, 2, 3, 4), (4, 3, 2, 1)], ['a', 'b', 'c', 'd'])
    transformer = WrapperTransformer(selected_cols=["a", "b"])
    transformed = transformer.transform(t_env, source_table)
    exec_insert_table(transformed, "TransformerResults")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["1,2", "4,3"])
def test_chaining_scalar_function(self):
    """Nest registered UDF calls inside one string-based select expression."""
    env = self.t_env

    env.create_temporary_system_function(
        "add_one", udf(lambda i: i + 1, result_type=DataTypes.BIGINT()))
    env.create_temporary_system_function(
        "subtract_one", udf(SubtractOne(), result_type=DataTypes.BIGINT()))
    env.create_temporary_system_function("add", add)

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()]
    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(['a', 'b', 'c'], sink_types))

    t = env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
    chained = t.select("add(add_one(a), subtract_one(b)), c, 1")
    exec_insert_table(chained, "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["3,1,1", "7,2,1", "4,3,1"])
def test_from_pandas(self):
    """Create a table from the fixture DataFrame, filter it, and check the sink.

    The schema of the created table must equal ``self.data_type``; only rows
    with ``f2 < 2`` are written.
    """
    env = self.t_env
    row_type = self.data_type

    table = env.from_pandas(self.pdf, row_type, 5)
    self.assertEqual(row_type, table.get_schema().to_row_data_type())

    filtered = table.filter("f2 < 2")
    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(
            self.data_type.field_names(), self.data_type.field_types()))
    exec_insert_table(filtered, "Results")

    collected = source_sink_utils.results()
    expected_row = (
        "1,1,1,1,true,1.1,1.2,hello,[97, 97, 97],"
        "1000000000000000000.010000000000000000,2014-09-13,01:00:01,"
        "1970-01-01 00:00:00.123,[hello, 中文],1,hello,"
        "1970-01-01 00:00:00.123,[1, 2]"
    )
    self.assert_equals(collected, [expected_row])
def test_get_execution_plan(self):
    """Check that the execution plan of the environment is valid JSON.

    A CSV source is copied into a CSV sink, then the plan returned by
    ``self.env.get_execution_plan()`` is parsed with ``json.loads``; the test
    fails if parsing raises.
    """
    tmp_dir = tempfile.gettempdir()
    # Pass the components to os.path.join separately instead of gluing them
    # with a hard-coded '/' and handing join a single argument.
    source_path = os.path.join(tmp_dir, 'streaming.csv')
    tmp_csv = os.path.join(tmp_dir, 'streaming2.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]

    t_env = StreamTableEnvironment.create(self.env)
    csv_source = CsvTableSource(source_path, field_names, field_types)
    t_env.register_table_source("Orders", csv_source)
    t_env.register_table_sink(
        "Results",
        CsvTableSink(field_names, field_types, tmp_csv))
    exec_insert_table(t_env.from_path("Orders"), "Results")

    plan = self.env.get_execution_plan()

    # The parsed value is not inspected; only successful parsing matters.
    json.loads(plan)
def test_set_environment(self):
    """Check the configured Python executable and that the gateway is disabled.

    A symlink to the current interpreter is set as the Python executable. One
    UDF asserts that the ``python`` environment variable equals that symlink
    path; another asserts that calling ``get_gateway()`` raises the expected
    error during UDF execution.
    """
    env = self.t_env

    python_exec = sys.executable
    python_exec_link_path = os.path.join(self.tempdir, "py_exec")
    os.symlink(python_exec, python_exec_link_path)
    env.get_config().set_python_executable(python_exec_link_path)

    def check_python_exec(i):
        import os
        assert os.environ["python"] == python_exec_link_path
        return i

    def check_pyflink_gateway_disabled(i):
        try:
            from pyflink.java_gateway import get_gateway
            get_gateway()
        except Exception as e:
            assert str(e).startswith(
                "It's launching the PythonGatewayServer during Python UDF"
                " execution which is unexpected.")
        else:
            # Reaching this branch means no error was raised at all.
            raise Exception("The gateway server is not disabled!")
        return i

    env.create_temporary_system_function(
        "check_python_exec",
        udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT()))
    env.create_temporary_system_function(
        "check_pyflink_gateway_disabled",
        udf(check_pyflink_gateway_disabled, DataTypes.BIGINT(), DataTypes.BIGINT()))

    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()]))

    t = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    checked = t.select(
        expr.call('check_python_exec', t.a),
        expr.call('check_pyflink_gateway_disabled', t.a))
    exec_insert_table(checked, "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["1,1", "2,2", "3,3"])
def test_chaining_scalar_function(self):
    """Nest UDF calls built with the expression DSL in a single select."""
    env = self.t_env

    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()]
    env.register_table_sink(
        "Results",
        source_sink_utils.TestAppendSink(['a', 'b', 'c'], sink_types))

    t = env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
    chained = t.select(
        add(add_one(t.a), subtract_one(t.b)),
        t.c,
        expr.lit(1))
    exec_insert_table(chained, "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["3,1,1", "7,2,1", "4,3,1"])
def test_scalar_function(self):
    """Exercise UDFs built from a lambda, a ScalarFunction, a callable and a partial."""
    import functools

    env = self.t_env

    # test metric disabled.
    env.get_config().get_configuration().set_string(
        'python.metric.enabled', 'false')

    # test lambda function
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())

    # test Python ScalarFunction
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

    # test callable function
    add_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())

    def partial_func(col, param):
        return col + param

    # test partial function
    add_one_partial = udf(functools.partial(partial_func, param=1),
                          result_type=DataTypes.BIGINT())

    sink_names = ['a', 'b', 'c', 'd', 'e', 'f']
    sink_types = [
        DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT(),
        DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT(),
    ]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(sink_names, sink_types))

    t = env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    filtered = t.where(add_one(t.b) <= 3)
    projected = filtered.select(
        add_one(t.a),
        subtract_one(t.b),
        add(t.a, t.c),
        add_one_callable(t.a),
        add_one_partial(t.a),
        t.a)
    exec_insert_table(projected, "Results")

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["2,1,4,2,2,1", "4,0,12,4,4,3"])
def test_register_temporary_table(self):
    """Copy rows between two temporary tables created through ``connect``.

    Both the source and the sink are filesystem CSV tables registered with
    ``create_temporary_table``. The query adds 1 to column ``a`` and the sink
    file content is compared verbatim.
    """
    self.env.set_parallelism(1)

    # Pass the components to os.path.join separately instead of gluing them
    # with a hard-coded '/' and handing join a single argument.
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    self.prepare_csv_source(source_path, data, field_types, field_names)

    sink_path = os.path.join(self.tempdir, 'streaming2.csv')
    # Remove a sink file left over from a previous run.
    if os.path.isfile(sink_path):
        os.remove(sink_path)

    t_env = self.t_env
    t_env.connect(FileSystem().path(source_path))\
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .create_temporary_table("source")
    t_env.connect(FileSystem().path(sink_path))\
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING()))\
        .create_temporary_table("sink")

    exec_insert_table(
        t_env.from_path("source").select("a + 1, b, c"), "sink")

    with open(sink_path, 'r') as f:
        lines = f.read()

    # assertEqual reports a readable diff and is not stripped under -O,
    # unlike a bare assert statement.
    self.assertEqual(lines, '2,Hi,Hello\n' + "3,Hello,Hello\n")
def test_execute(self):
    """Insert one row into a CSV sink and inspect the returned execution result.

    The result must expose a job id and a net runtime, report no accumulator
    results, return ``None`` for an unknown accumulator name, and have a
    string representation.
    """
    tmp_dir = tempfile.gettempdir()
    field_names = ['a', 'b', 'c']
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]

    # Build the sink path from separate components; the file name is derived
    # from the current time rounded to whole seconds.
    sink_file = os.path.join(tmp_dir, '{}.csv'.format(round(time.time())))

    t_env = StreamTableEnvironment.create(self.env)
    t_env.register_table_sink(
        'Results',
        CsvTableSink(field_names, field_types, sink_file))

    execution_result = exec_insert_table(
        t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c']),
        'Results')

    self.assertIsNotNone(execution_result.get_job_id())
    self.assertIsNotNone(execution_result.get_net_runtime())
    self.assertEqual(len(execution_result.get_all_accumulator_results()), 0)
    self.assertIsNone(execution_result.get_accumulator_result('accumulator'))
    self.assertIsNotNone(str(execution_result))