def _get_connected_stream_operator(self,
                                   func: Union[Function, FunctionWrapper],
                                   type_info: TypeInformation,
                                   func_name: str,
                                   func_type: int):
    """
    Build the Java two-input operator that runs a user defined Python function over the two
    streams held by this object (``self.stream1`` and ``self.stream2``).

    :param func: the user function object; it is serialized with cloudpickle and handed to
                 the Java side as a byte array.
    :param type_info: the output type. ``None`` selects the pickled byte array type info, a
                      list is wrapped into a :class:`RowTypeInfo`, anything else is used as is.
    :param func_name: function name passed to the Java ``DataStreamPythonFunction``.
    :param func_type: function type code passed to the Java ``DataStreamPythonFunctionInfo``.
    :return: A tuple of the Java operator and the Java type information of its output.
    """
    gateway = get_gateway()
    import cloudpickle
    pickled_func = cloudpickle.dumps(func)

    # The input types are read from the transformations behind each of the two streams.
    j_first_input_type = self.stream1._j_data_stream.getTransformation().getOutputType()
    j_second_input_type = self.stream2._j_data_stream.getTransformation().getOutputType()

    # Resolve the Python-side description of the output type.
    if type_info is None:
        output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
    elif isinstance(type_info, list):
        output_type_info = RowTypeInfo(type_info)
    else:
        output_type_info = type_info

    # Wrap the pickled function, then pair it with its function type.
    JDataStreamPythonFunction = (
        gateway.jvm.org.apache.flink.datastream.runtime.functions.python
        .DataStreamPythonFunction)
    j_function = JDataStreamPythonFunction(
        func_name, bytearray(pickled_func), _get_python_env())

    JDataStreamPythonFunctionInfo = (
        gateway.jvm.org.apache.flink.datastream.runtime.functions.python
        .DataStreamPythonFunctionInfo)
    j_function_info = JDataStreamPythonFunctionInfo(j_function, func_type)

    # This operator is created with a fresh, empty Configuration.
    j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()

    JTwoInputOperator = (
        gateway.jvm.org.apache.flink.datastream.runtime.operators.python
        .DataStreamTwoInputPythonStatelessFunctionOperator)
    j_operator = JTwoInputOperator(
        j_conf,
        j_first_input_type,
        j_second_input_type,
        output_type_info.get_java_type_info(),
        j_function_info,
        self._is_keyed_stream())

    return j_operator, output_type_info.get_java_type_info()
def _get_java_python_function_operator(self,
                                       func: Union[Function, FunctionWrapper],
                                       type_info: TypeInformation,
                                       func_name: str,
                                       func_type: int):
    """
    Assemble the Java operator that executes a user defined Python function on this stream.

    The function is serialized with cloudpickle, wrapped into the Java function and function
    info objects, and combined with the configuration obtained from
    ``PythonConfigUtil.getMergedConfig`` for this stream's execution environment.

    :param func: a function object that implements the Function interface.
    :param type_info: the data type of the function output data. ``None`` selects the pickled
                      byte array type info and a list is wrapped into a :class:`RowTypeInfo`.
    :param func_name: function name.
    :param func_type: function type, supports MAP, FLAT_MAP, etc.
    :return: A tuple of the Java operator and the Python-side output type information (the
             Python object itself, not its Java counterpart).
    """
    gateway = get_gateway()
    import cloudpickle
    pickled_func = cloudpickle.dumps(func)

    j_input_types = self._j_data_stream.getTransformation().getOutputType()

    # Resolve the Python-side description of the output type.
    if type_info is None:
        output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
    elif isinstance(type_info, list):
        output_type_info = RowTypeInfo(type_info)
    else:
        output_type_info = type_info

    # Wrap the pickled function, then pair it with its function type.
    JDataStreamPythonFunction = (
        gateway.jvm.org.apache.flink.datastream.runtime.functions.python
        .DataStreamPythonFunction)
    j_function = JDataStreamPythonFunction(
        func_name, bytearray(pickled_func), _get_python_env())

    JDataStreamPythonFunctionInfo = (
        gateway.jvm.org.apache.flink.datastream.runtime.functions.python
        .DataStreamPythonFunctionInfo)
    j_function_info = JDataStreamPythonFunctionInfo(j_function, func_type)

    # The operator configuration is derived from the stream's execution environment.
    j_env = self._j_data_stream.getExecutionEnvironment()
    JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
    j_conf = JPythonConfigUtil.getMergedConfig(j_env)

    JStatelessOperator = (
        gateway.jvm.org.apache.flink.datastream.runtime.operators.python
        .DataStreamPythonStatelessFunctionOperator)
    j_operator = JStatelessOperator(
        j_conf,
        j_input_types,
        output_type_info.get_java_type_info(),
        j_function_info)

    return j_operator, output_type_info
def _build_orc_job(self, row_type: RowType, row_type_info: RowTypeInfo, data: List[Row]):
    """
    Wire up a job that writes the given rows into ``self.orc_dir_name`` through an ORC bulk
    format file sink.

    This method only builds the pipeline on ``self.env``; it does not itself trigger
    execution.

    :param row_type: the row type handed to ``OrcBulkWriters.for_row_type``.
    :param row_type_info: type information of the rows; its Java counterpart is used for the
                          collection source.
    :param data: the rows to write.
    """
    jvm = get_gateway().jvm

    orc_writer_factory = OrcBulkWriters.for_row_type(row_type)
    orc_sink = FileSink.for_bulk_format(self.orc_dir_name, orc_writer_factory).build()

    # Copy the rows into a Java list so that the Java environment can consume them.
    j_rows = jvm.java.util.ArrayList()
    for record in data:
        j_rows.add(to_java_data_structure(record))

    j_source = self.env._j_stream_execution_environment.fromCollection(
        j_rows, row_type_info.get_java_type_info())
    DataStream(j_source).sink_to(orc_sink)
class OutputTag(object):
    """
    A typed and named tag used for tagging side outputs of an operator.

    Example:
    ::

        # Explicitly specify output type
        >>> info = OutputTag("late-data", Types.TUPLE([Types.STRING(), Types.LONG()]))
        # Implicitly wrap list to Types.ROW
        >>> info_row = OutputTag("row", [Types.STRING(), Types.LONG()])
        # Implicitly use pickle serialization
        >>> info_side = OutputTag("side")
        # ERROR: tag id cannot be empty string (extra requirement for Python API)
        >>> info_error = OutputTag("")

    """

    def __init__(self, tag_id: str, type_info: Optional[Union[TypeInformation, list]] = None):
        """
        :param tag_id: the name of the tag; must be neither ``None`` nor an empty string.
        :param type_info: the type of the tagged records. ``None`` selects the pickled byte
                          array type and a list is wrapped into a :class:`RowTypeInfo`.
        :raises ValueError: if ``tag_id`` is ``None`` or empty.
        :raises TypeError: if ``type_info`` is not ``None``, a list or a TypeInformation.
        """
        if not tag_id:
            raise ValueError("OutputTag tag_id cannot be None or empty string")
        self.tag_id = tag_id

        if type_info is None:
            resolved_type_info = Types.PICKLED_BYTE_ARRAY()
        elif isinstance(type_info, list):
            resolved_type_info = RowTypeInfo(type_info)
        elif isinstance(type_info, TypeInformation):
            resolved_type_info = type_info
        else:
            raise TypeError("OutputTag type_info must be None, list or TypeInformation")
        self.type_info = resolved_type_info

    def get_java_output_tag(self):
        """
        Create the Java ``org.apache.flink.util.OutputTag`` matching this tag.

        :return: the Java OutputTag built from ``tag_id`` and the Java type information.
        """
        jvm = get_gateway().jvm
        JOutputTag = jvm.org.apache.flink.util.OutputTag
        j_output_tag = JOutputTag(self.tag_id, self.type_info.get_java_type_info())
        # Reset the `_j_typeinfo` attribute of the type info to deal with serializability
        # (presumably so that this Python object does not keep a Java reference around).
        self.type_info._j_typeinfo = None
        return j_output_tag
def _get_java_python_function_operator(self,
                                       func: Union[Function, FunctionWrapper],
                                       type_info: TypeInformation,
                                       func_name: str,
                                       func_type: int):
    """
    Assemble the Java operator that executes a user defined Python function on this stream.

    Three operator classes can be chosen: the reduce operator when ``func_type`` is REDUCE,
    the partition custom operator when ``str(func)`` equals
    ``'_Flink_PartitionCustomMapFunction'``, and the stateless operator otherwise.

    :param func: a function object that implements the Function interface.
    :param type_info: the data type of the function output data. ``None`` selects the pickled
                      byte array type info and a list is wrapped into a :class:`RowTypeInfo`.
                      It is not used for the operator of a REDUCE function.
    :param func_name: function name.
    :param func_type: function type, supports MAP, FLAT_MAP, etc.
    :return: A tuple of the Java operator and the Java type information of its output.
    """
    gateway = get_gateway()
    import cloudpickle
    pickled_func = cloudpickle.dumps(func)

    j_input_types = self._j_data_stream.getTransformation().getOutputType()

    # Resolve the Python-side description of the output type.
    if type_info is None:
        output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
    elif isinstance(type_info, list):
        output_type_info = RowTypeInfo(type_info)
    else:
        output_type_info = type_info

    # Wrap the pickled function, then pair it with its function type.
    JDataStreamPythonFunction = (
        gateway.jvm.org.apache.flink.datastream.runtime.functions.python
        .DataStreamPythonFunction)
    j_function = JDataStreamPythonFunction(
        func_name, bytearray(pickled_func), _get_python_env())

    JDataStreamPythonFunctionInfo = (
        gateway.jvm.org.apache.flink.datastream.runtime.functions.python
        .DataStreamPythonFunctionInfo)
    j_function_info = JDataStreamPythonFunctionInfo(j_function, func_type)

    j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()

    from pyflink.fn_execution.flink_fn_execution_pb2 import UserDefinedDataStreamFunction
    if func_type == UserDefinedDataStreamFunction.REDUCE:
        # set max bundle size to 1 to force synchronize process for reduce function.
        j_conf.setInteger(gateway.jvm.org.apache.flink.python.PythonOptions.MAX_BUNDLE_SIZE, 1)
        JReduceOperator = (
            gateway.jvm.org.apache.flink.datastream.runtime.operators.python
            .DataStreamPythonReduceFunctionOperator)
        # The reduce output type is the type found at position 1 of the input type.
        j_output_type_info = j_input_types.getTypeAt(1)
        j_reduce_operator = JReduceOperator(
            j_conf, j_input_types, j_output_type_info, j_function_info)
        return j_reduce_operator, j_output_type_info

    # Any other function type: choose between the partition custom and stateless operators.
    if str(func) == '_Flink_PartitionCustomMapFunction':
        JFunctionOperator = (
            gateway.jvm.org.apache.flink.datastream.runtime.operators.python
            .DataStreamPythonPartitionCustomFunctionOperator)
    else:
        JFunctionOperator = (
            gateway.jvm.org.apache.flink.datastream.runtime.operators.python
            .DataStreamPythonStatelessFunctionOperator)

    j_operator = JFunctionOperator(
        j_conf,
        j_input_types,
        output_type_info.get_java_type_info(),
        j_function_info)
    return j_operator, output_type_info.get_java_type_info()