def setUp(self):
    super(PyFlinkStreamTableTestCase, self).setUp()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)
    self.t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_old_planner().build())
def setUp(self):
    super(PyFlinkBlinkStreamTableTestCase, self).setUp()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)
    self.t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    self.t_env.get_config().get_configuration().set_string(
        "taskmanager.memory.task.off-heap.size", "80mb")
def setUp(self):
    super(PyFlinkBlinkStreamTableTestCase, self).setUp()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)
    self.t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    self.t_env.get_config().get_configuration().set_string(
        "python.fn-execution.bundle.size", "1")
def input_output_table():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    statement_set = table_env.create_statement_set()
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/input_output.py"
    prop = {}
    func = "map_func"
    env_path = None
    # Exchange rows with the TF workers as CSV-encoded records.
    prop[MLCONSTANTS.ENCODING_CLASS] = \
        "com.alibaba.flink.ml.operator.coding.RowCSVCoding"
    prop[MLCONSTANTS.DECODING_CLASS] = \
        "com.alibaba.flink.ml.operator.coding.RowCSVCoding"
    inputSb = "INT_32,INT_64,FLOAT_32,FLOAT_64,STRING"
    prop["sys:csv_encode_types"] = inputSb
    prop["sys:csv_decode_types"] = inputSb
    prop[MLCONSTANTS.PYTHON_VERSION] = "3.7"
    source_file = os.getcwd() + "/../../src/test/resources/input.csv"
    sink_file = os.getcwd() + "/../../src/test/resources/output.csv"
    table_source = CsvTableSource(source_file,
                                  ["a", "b", "c", "d", "e"],
                                  [DataTypes.INT(), DataTypes.BIGINT(),
                                   DataTypes.FLOAT(), DataTypes.DOUBLE(),
                                   DataTypes.STRING()])
    table_env.register_table_source("source", table_source)
    input_tb = table_env.from_path("source")
    output_schema = TableSchema(["a", "b", "c", "d", "e"],
                                [DataTypes.INT(), DataTypes.BIGINT(),
                                 DataTypes.FLOAT(), DataTypes.DOUBLE(),
                                 DataTypes.STRING()])
    sink = CsvTableSink(["a", "b", "c", "d", "e"],
                        [DataTypes.INT(), DataTypes.BIGINT(),
                         DataTypes.FLOAT(), DataTypes.DOUBLE(),
                         DataTypes.STRING()],
                        sink_file,
                        write_mode=WriteMode.OVERWRITE)
    table_env.register_table_sink("table_row_sink", sink)
    tf_config = TFConfig(work_num, ps_num, prop, python_file, func, env_path)
    output_table = train(stream_env, table_env, statement_set, input_tb,
                         tf_config, output_schema)
    # output_table = inference(stream_env, table_env, statement_set, input_tb,
    #                          tf_config, output_schema)
    statement_set.add_insert("table_row_sink", output_table)
    job_client = statement_set.execute().get_job_client()
    if job_client is not None:
        job_client.get_job_execution_result(user_class_loader=None).result()
def get_stream_table_environment(self) -> StreamTableEnvironment:
    """
    Get the StreamTableEnvironment. If the StreamTableEnvironment has not
    been set, it is initialized with the default Configuration.

    :return: the StreamTableEnvironment.

    .. versionadded:: 1.11.0
    """
    if self._stream_tab_env is None:
        self._stream_tab_env = StreamTableEnvironment.create(
            StreamExecutionEnvironment.get_execution_environment())
    return self._stream_tab_env
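# Usage sketch for the accessor above (hedged; assumes an already
# constructed MLEnvironment instance named `ml_env`): the table environment
# is created lazily on first access and then cached, so repeated calls
# return the same instance.
t_env_first = ml_env.get_stream_table_environment()
t_env_again = ml_env.get_stream_table_environment()
assert t_env_first is t_env_again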
def addTrainStream():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/add.py"
    func = "map_func"
    property = None
    env_path = None
    zk_conn = None
    zk_base_path = None
    input_ds = None
    output_row_type = None
    train(work_num, ps_num, python_file, func, property, env_path,
          zk_conn, zk_base_path, stream_env, input_ds, output_row_type)
def addTrainTable():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/add.py"
    func = "map_func"
    property = None
    env_path = None
    zk_conn = None
    zk_base_path = None
    input_tb = None
    output_schema = None
    train(work_num, ps_num, python_file, func, property, env_path,
          zk_conn, zk_base_path, stream_env, table_env, input_tb,
          output_schema)
def addTrainChiefAloneStream():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/add.py"
    func = "map_func"
    property = {}
    property[TFCONSTANS.TF_IS_CHIEF_ALONE] = "true"
    env_path = None
    zk_conn = None
    zk_base_path = None
    input_ds = None
    output_row_type = None
    train(work_num, ps_num, python_file, func, property, env_path,
          zk_conn, zk_base_path, stream_env, input_ds, output_row_type)
def fit(self, *inputs: Table) -> 'TensorflowModel':
    if len(inputs) == 0:
        if self.table_env is None:
            raise RuntimeError(
                "table_env should not be None if inputs is not given")
        input_table = None
        t_env = self.table_env
    else:
        input_table = inputs[0]
        t_env = input_table._t_env
    # Reuse an externally provided statement set if one was configured;
    # otherwise create a fresh one for this training job.
    statement_set = self.statement_set if self.statement_set \
        else t_env.create_statement_set()
    env = StreamExecutionEnvironment(t_env._j_tenv.execEnv())
    train(env, t_env, statement_set, input_table, self.tf_config)
    return TensorflowModel(self.tf_config, statement_set,
                           self.predict_col_names, self.predict_data_types)
def worker_zero_finish():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    work_num = 3
    ps_num = 2
    python_file = os.getcwd() + "/../../src/test/python/worker_0_finish.py"
    func = "map_func"
    prop = {MLCONSTANTS.PYTHON_VERSION: '3.7'}
    env_path = None
    input_tb = None
    output_schema = None
    tf_config = TFConfig(work_num, ps_num, prop, python_file, func, env_path)
    train(stream_env, table_env, input_tb, tf_config, output_schema)
    # inference(stream_env, table_env, input_tb, tf_config, output_schema)
    table_env.execute("train")
def add_train_chief_alone_table():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/add.py"
    func = "map_func"
    prop = {}
    prop[TFCONSTANS.TF_IS_CHIEF_ALONE] = "true"
    prop[MLCONSTANTS.PYTHON_VERSION] = "3.7"
    env_path = None
    input_tb = None
    output_schema = None
    tf_config = TFConfig(work_num, ps_num, prop, python_file, func, env_path)
    train(stream_env, table_env, input_tb, tf_config, output_schema)
    # inference(stream_env, table_env, input_tb, tf_config, output_schema)
    table_env.execute("train")
def get_default() -> Optional[MLEnvironment]:
    """
    Get the MLEnvironment that uses the default MLEnvironmentId.

    :return: the default MLEnvironment.

    .. versionadded:: 1.11.0
    """
    with MLEnvironmentFactory._lock:
        if MLEnvironmentFactory._map[
                MLEnvironmentFactory._default_ml_environment_id] is None:
            j_ml_env = get_gateway().\
                jvm.org.apache.flink.ml.common.MLEnvironmentFactory.getDefault()
            ml_env = MLEnvironment(
                ExecutionEnvironment(j_ml_env.getExecutionEnvironment()),
                StreamExecutionEnvironment(
                    j_ml_env.getStreamExecutionEnvironment()),
                BatchTableEnvironment(j_ml_env.getBatchTableEnvironment()),
                StreamTableEnvironment(j_ml_env.getStreamTableEnvironment()))
            MLEnvironmentFactory._map[
                MLEnvironmentFactory._default_ml_environment_id] = ml_env
        return MLEnvironmentFactory._map[
            MLEnvironmentFactory._default_ml_environment_id]
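# Hedged companion sketch: fetch the process-wide default MLEnvironment
# from the factory and reuse its lazily created environments instead of
# constructing new ones per job. `get_stream_table_environment()` is the
# accessor shown earlier in this section; the variable names are
# illustrative only.
default_ml_env = MLEnvironmentFactory.get_default()
t_env = default_ml_env.get_stream_table_environment()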
def worker_zero_finish():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    statement_set = table_env.create_statement_set()
    work_num = 3
    ps_num = 2
    python_file = os.getcwd() + "/../../src/test/python/worker_0_finish.py"
    func = "map_func"
    prop = {MLCONSTANTS.PYTHON_VERSION: '3.7'}
    env_path = None
    input_tb = None
    output_schema = None
    tf_config = TFConfig(work_num, ps_num, prop, python_file, func, env_path)
    train(stream_env, table_env, statement_set, input_tb, tf_config,
          output_schema)
    # inference(stream_env, table_env, statement_set, input_tb, tf_config,
    #           output_schema)
    job_client = statement_set.execute().get_job_client()
    if job_client is not None:
        job_client.get_job_execution_result(user_class_loader=None).result()
def inputOutputTable():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/input_output.py"
    property = {}
    func = "map_func"
    env_path = None
    zk_conn = None
    zk_base_path = None
    property[MLCONSTANTS.ENCODING_CLASS] = \
        "com.alibaba.flink.ml.operator.coding.RowCSVCoding"
    property[MLCONSTANTS.DECODING_CLASS] = \
        "com.alibaba.flink.ml.operator.coding.RowCSVCoding"
    inputSb = "INT_32,INT_64,FLOAT_32,FLOAT_64,STRING"
    property["SYS:csv_encode_types"] = inputSb
    property["SYS:csv_decode_types"] = inputSb
    source_file = os.getcwd() + "/../../src/test/resources/input.csv"
    table_source = CsvTableSource(source_file,
                                  ["a", "b", "c", "d", "e"],
                                  [DataTypes.INT(), DataTypes.INT(),
                                   DataTypes.FLOAT(), DataTypes.DOUBLE(),
                                   DataTypes.STRING()])
    table_env.register_table_source("source", table_source)
    input_tb = table_env.scan("source")
    output_schema = TableSchema(["a", "b", "c", "d", "e"],
                                [DataTypes.INT(), DataTypes.INT(),
                                 DataTypes.FLOAT(), DataTypes.DOUBLE(),
                                 DataTypes.STRING()])
    train(work_num, ps_num, python_file, func, property, env_path,
          zk_conn, zk_base_path, stream_env, table_env, input_tb,
          output_schema)
def inference(num_worker,
              num_ps=0,
              python_file=None,
              func=None,
              properties=None,
              env_path=None,
              zk_conn=None,
              zk_base_path=None,
              stream_env=None,
              table_env=None,
              input_table=None,
              output_schema=None):
    """
    TensorFlow inference for a Table.

    :param num_worker: Number of workers.
    :param num_ps: Number of PS nodes.
    :param python_file: The python file which is going to be run.
    :param func: The user-defined function that runs TF inference. If it's
                 None, inference is run via the Java API.
    :param properties: User-defined properties.
    :param env_path: Path to the virtual env.
    :param zk_conn: The ZooKeeper connection string.
    :param zk_base_path: The ZooKeeper base path.
    :param stream_env: The StreamExecutionEnvironment. If it's None, this
                       method creates one and executes the job at the end.
                       Otherwise, the caller is responsible for triggering
                       the job execution.
    :param table_env: The TableEnvironment.
    :param input_table: The input Table.
    :param output_schema: The TableSchema of the output Table. If it's None,
                          a dummy sink is added to the output Table.
                          Otherwise, the caller is responsible for adding a
                          sink before executing the job.
    :return: The output Table.
    """
    tf_config = TFConfig(num_worker, num_ps, python_file, func, properties,
                         env_path, zk_conn, zk_base_path)
    if stream_env is None:
        stream_env = StreamExecutionEnvironment.get_execution_environment()
    if table_env is None:
        table_env = StreamTableEnvironment.create(stream_env)
    # Unwrap the Python wrappers before crossing the Py4J gateway.
    if input_table is not None:
        input_table = input_table._j_table
    if output_schema is not None:
        output_schema = output_schema._j_table_schema
    output_table = get_gateway().jvm.com.alibaba.flink.ml.tensorflow.client \
        .TFUtils.inference(
            stream_env._j_stream_execution_environment, table_env._j_tenv,
            input_table, tf_config.java_config(), output_schema)
    table_env.execute(job_name="table inference")
    return Table(output_table)
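# A hedged example of calling the wrapper above. The script path is a
# placeholder, and `input_tb`/`output_schema` stand for a previously built
# Table and TableSchema; only the keyword names come from the signature.
output = inference(num_worker=2,
                   num_ps=1,
                   python_file="/path/to/inference_job.py",  # placeholder
                   func="map_func",
                   properties={MLCONSTANTS.PYTHON_VERSION: "3.7"},
                   input_table=input_tb,
                   output_schema=output_schema)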
def add_train_chief_alone_table():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    statement_set = table_env.create_statement_set()
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/add.py"
    func = "map_func"
    prop = {}
    prop[TFCONSTANS.TF_IS_CHIEF_ALONE] = "true"
    prop[MLCONSTANTS.PYTHON_VERSION] = "3.7"
    env_path = None
    input_tb = None
    output_schema = None
    tf_config = TFConfig(work_num, ps_num, prop, python_file, func, env_path)
    train(stream_env, table_env, statement_set, input_tb, tf_config,
          output_schema)
    # inference(stream_env, table_env, statement_set, input_tb, tf_config,
    #           output_schema)
    job_client = statement_set.execute().get_job_client()
    if job_client is not None:
        job_client.get_job_execution_result(user_class_loader=None).result()
def setUp(self):
    super(PyFlinkBatchTestCase, self).setUp()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)
    self.env.set_runtime_mode(RuntimeExecutionMode.BATCH)
def setUp(self):
    super(PyFlinkStreamingTestCase, self).setUp()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)
    self.env.set_runtime_mode(RuntimeExecutionMode.STREAMING)
    self.env._remote_mode = True