Example #1
    def _get_java_python_function_operator(self, func: Union[Function,
                                                             FunctionWrapper],
                                           type_info: TypeInformation,
                                           func_name: str, func_type: int):
        """
        Create a flink operator according to user provided function object, data types,
        function name and function type.

        :param func: a function object that implements the Function interface.
        :param type_info: the data type of the function output data.
        :param func_name: function name.
        :param func_type: function type, supports MAP, FLAT_MAP, etc.
        :return: A flink java operator which is responsible for execution user defined python
                 function.
        """

        gateway = get_gateway()
        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        j_input_types = self._j_data_stream.getTransformation().getOutputType()

        if type_info is None:
            output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
        elif isinstance(type_info, list):
            output_type_info = RowTypeInfo(type_info)
        else:
            output_type_info = type_info

        DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
            .python.DataStreamPythonFunction
        j_python_data_stream_scalar_function = DataStreamPythonFunction(
            func_name, bytearray(serialized_func), _get_python_env())

        DataStreamPythonFunctionInfo = gateway.jvm. \
            org.apache.flink.datastream.runtime.functions.python \
            .DataStreamPythonFunctionInfo

        j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
            j_python_data_stream_scalar_function, func_type)

        j_env = self._j_data_stream.getExecutionEnvironment()
        PythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
        j_conf = PythonConfigUtil.getMergedConfig(j_env)

        DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
            .operators.python.DataStreamPythonStatelessFunctionOperator

        j_python_data_stream_scalar_function_operator = DataStreamPythonFunctionOperator(
            j_conf, j_input_types, output_type_info.get_java_type_info(),
            j_python_data_stream_function_info)
        return j_python_data_stream_scalar_function_operator, output_type_info
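
For context, here is a hedged sketch of how this internal helper surfaces to users: a plain map() call with a declared output type. The output_type parameter name follows recent PyFlink releases and is an assumption here, as are the element values.

# Hedged usage sketch, assuming a local PyFlink installation.
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_collection([(1, 'a'), (2, 'b')],
                         type_info=Types.ROW([Types.INT(), Types.STRING()]))

# The declared output type reaches the helper above as type_info; omitting it
# would select PICKLED_BYTE_ARRAY_TYPE_INFO, and a plain list of types would be
# wrapped into a RowTypeInfo.
ds.map(lambda r: (r[0] + 1, r[1]),
       output_type=Types.ROW([Types.INT(), Types.STRING()])).print()
env.execute("map_example")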
Example #2
    def test_row_type(self):
        self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()])
                         .get_field_names(), ['f0', 'f1'])
        self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()],
                                     ['a', 'b']).get_field_names(), ['a', 'b'])

        self.assertTrue(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b'])
                        == RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']))
        self.assertFalse(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b'])
                         == RowTypeInfo([Types.STRING(), Types.INT()], ['a', 'b']))
        self.assertEqual(str(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b'])),
                         "RowTypeInfo(a: String, b: String)")

        self.assertEqual(Types.ROW([Types.STRING(), Types.STRING()]),
                         RowTypeInfo([Types.STRING(), Types.STRING()]))

        self.assertEqual(Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()])
                         .get_field_names(), ['a', 'b'])

        self.assertEqual(Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()])
                         .get_field_types(), [Types.STRING(), Types.STRING()])
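
The semantics the test pins down can be seen directly; this minimal sketch only restates facts asserted above and should run against any PyFlink installation.

from pyflink.common.typeinfo import RowTypeInfo, Types

# Field names default to f0, f1, ... unless given explicitly.
print(RowTypeInfo([Types.STRING(), Types.STRING()]).get_field_names())  # ['f0', 'f1']

named = RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b'])
print(str(named))  # RowTypeInfo(a: String, b: String)

# Types.ROW is a shorthand that produces an equal RowTypeInfo.
print(Types.ROW([Types.STRING(), Types.STRING()])
      == RowTypeInfo([Types.STRING(), Types.STRING()]))  # True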
Example #3
    def __init__(self,
                 tag_id: str,
                 type_info: Optional[Union[TypeInformation, list]] = None):
        if not tag_id:
            raise ValueError("OutputTag tag_id cannot be None or empty string")
        self.tag_id = tag_id
        if type_info is None:
            self.type_info = Types.PICKLED_BYTE_ARRAY()
        elif isinstance(type_info, list):
            self.type_info = RowTypeInfo(type_info)
        elif not isinstance(type_info, TypeInformation):
            raise TypeError(
                "OutputTag type_info must be None, list or TypeInformation")
        else:
            self.type_info = type_info
Example #4
    def _get_connected_stream_operator(self, func: Union[Function, FunctionWrapper],
                                       type_info: TypeInformation, func_name: str,
                                       func_type: int):
        """
        Create a Flink operator for a user-defined function applied to two connected
        streams, analogous to the single-input variant above.
        """
        gateway = get_gateway()
        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        j_input_types1 = self.stream1._j_data_stream.getTransformation().getOutputType()
        j_input_types2 = self.stream2._j_data_stream.getTransformation().getOutputType()

        if type_info is None:
            output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
        elif isinstance(type_info, list):
            output_type_info = RowTypeInfo(type_info)
        else:
            output_type_info = type_info

        DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
            .python.DataStreamPythonFunction
        j_python_data_stream_scalar_function = DataStreamPythonFunction(
            func_name,
            bytearray(serialized_func),
            _get_python_env())

        DataStreamPythonFunctionInfo = gateway.jvm. \
            org.apache.flink.datastream.runtime.functions.python \
            .DataStreamPythonFunctionInfo

        j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
            j_python_data_stream_scalar_function,
            func_type)

        j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()
        DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
            .operators.python.DataStreamTwoInputPythonStatelessFunctionOperator
        j_python_data_stream_function_operator = DataStreamPythonFunctionOperator(
            j_conf,
            j_input_types1,
            j_input_types2,
            output_type_info.get_java_type_info(),
            j_python_data_stream_function_info,
            self._is_keyed_stream())

        return j_python_data_stream_function_operator, output_type_info.get_java_type_info()
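
A hedged sketch of the user-facing path into this helper: connecting two streams and mapping them with a CoMapFunction. The class, values and job name are illustrative, and the output_type parameter name follows recent PyFlink signatures.

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.functions import CoMapFunction

class MyCoMap(CoMapFunction):
    def map1(self, value):
        return value + 1      # elements from the first stream

    def map2(self, value):
        return len(value)     # elements from the second stream

env = StreamExecutionEnvironment.get_execution_environment()
ints = env.from_collection([1, 2, 3], type_info=Types.INT())
strs = env.from_collection(['a', 'bb'], type_info=Types.STRING())

# connect() yields the ConnectedStreams whose map() builds the two-input operator.
ints.connect(strs).map(MyCoMap(), output_type=Types.INT()).print()
env.execute("co_map_example")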
Example #5
    def _build_orc_job(self, row_type: RowType, row_type_info: RowTypeInfo, data: List[Row]):
        jvm = get_gateway().jvm
        sink = FileSink.for_bulk_format(
            self.orc_dir_name, OrcBulkWriters.for_row_type(row_type)
        ).build()
        j_list = jvm.java.util.ArrayList()
        for d in data:
            j_list.add(to_java_data_structure(d))
        ds = DataStream(self.env._j_stream_execution_environment.fromCollection(
            j_list,
            row_type_info.get_java_type_info()
        ))
        ds.sink_to(sink)
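
A hedged sketch of a test method that could drive the helper above from the same test class; the field names, values and job name are illustrative assumptions.

    def test_orc_sink(self):
        from pyflink.common import Row
        from pyflink.common.typeinfo import RowTypeInfo, Types
        from pyflink.table.types import DataTypes

        row_type = DataTypes.ROW([
            DataTypes.FIELD('name', DataTypes.STRING()),
            DataTypes.FIELD('age', DataTypes.INT()),
        ])
        row_type_info = RowTypeInfo([Types.STRING(), Types.INT()], ['name', 'age'])

        self._build_orc_job(row_type, row_type_info,
                            [Row('alice', 30), Row('bob', 25)])
        # Nothing is written until the job actually runs.
        self.env.execute('test_orc_sink')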
Example #6
class OutputTag(object):
    """
    An :class:`OutputTag` is a typed and named tag to use for tagging side outputs of an operator.

    Example:
    ::

        # Explicitly specify output type
        >>> info = OutputTag("late-data", Types.TUPLE([Types.STRING(), Types.LONG()]))
        # Implicitly wrap list to Types.ROW
        >>> info_row = OutputTag("row", [Types.STRING(), Types.LONG()])
        # Implicitly use pickle serialization
        >>> info_side = OutputTag("side")
        # ERROR: tag id cannot be empty string (extra requirement for Python API)
        >>> info_error = OutputTag("")

    """
    def __init__(self,
                 tag_id: str,
                 type_info: Optional[Union[TypeInformation, list]] = None):
        if not tag_id:
            raise ValueError("OutputTag tag_id cannot be None or empty string")
        self.tag_id = tag_id
        if type_info is None:
            self.type_info = Types.PICKLED_BYTE_ARRAY()
        elif isinstance(type_info, list):
            self.type_info = RowTypeInfo(type_info)
        elif not isinstance(type_info, TypeInformation):
            raise TypeError(
                "OutputTag type_info must be None, list or TypeInformation")
        else:
            self.type_info = type_info

    def get_java_output_tag(self):
        gateway = get_gateway()
        j_obj = gateway.jvm.org.apache.flink.util.OutputTag(
            self.tag_id, self.type_info.get_java_type_info())
        # Clear the cached Java type info so this OutputTag stays picklable.
        self.type_info._j_typeinfo = None
        return j_obj
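
A hedged end-to-end sketch of side outputs with OutputTag; it assumes a PyFlink release where DataStream.get_side_output is available, and the routing condition and names are illustrative.

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment, OutputTag
from pyflink.datastream.functions import ProcessFunction

late = OutputTag("late-data", Types.TUPLE([Types.STRING(), Types.LONG()]))

class RouteLate(ProcessFunction):
    def process_element(self, value, ctx):
        if value[1] < 0:
            # Yielding a (tag, value) pair routes the record to the side output.
            yield late, value
        else:
            yield value

env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_collection([('a', 1), ('b', -1)],
                         type_info=Types.TUPLE([Types.STRING(), Types.LONG()]))
main = ds.process(RouteLate(),
                  output_type=Types.TUPLE([Types.STRING(), Types.LONG()]))
main.get_side_output(late).print()
main.print()
env.execute("side_output_example")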
Example #7
    def sink(sql: str,
             type_info: RowTypeInfo,
             jdbc_connection_options: 'JdbcConnectionOptions',
             jdbc_execution_options: 'JdbcExecutionOptions' = None):
        """
        Create a JDBC sink.

        :param sql: arbitrary DML query (e.g. insert, update, upsert)
        :param type_info: A RowTypeInfo for query field types.
        :param jdbc_execution_options:  parameters of execution, such as batch size and maximum
                                        retries.
        :param jdbc_connection_options: parameters of connection, such as JDBC URL.
        :return: A JdbcSink.
        """
        sql_types = []
        gateway = get_gateway()
        JJdbcTypeUtil = gateway.jvm.org.apache.flink.connector.jdbc.utils.JdbcTypeUtil
        for field_type in type_info.get_field_types():
            sql_types.append(
                JJdbcTypeUtil.typeInformationToSqlType(
                    field_type.get_java_type_info()))
        j_sql_type = to_jarray(gateway.jvm.int, sql_types)
        output_format_clz = gateway.jvm.Class\
            .forName('org.apache.flink.connector.jdbc.internal.JdbcBatchingOutputFormat', False,
                     gateway.jvm.Thread.currentThread().getContextClassLoader())
        j_int_array_type = to_jarray(gateway.jvm.int, []).getClass()
        j_builder_method = output_format_clz.getDeclaredMethod(
            'createRowJdbcStatementBuilder',
            to_jarray(gateway.jvm.Class, [j_int_array_type]))
        j_builder_method.setAccessible(True)
        j_statement_builder = j_builder_method.invoke(
            None, to_jarray(gateway.jvm.Object, [j_sql_type]))

        jdbc_execution_options = jdbc_execution_options if jdbc_execution_options is not None \
            else JdbcExecutionOptions.defaults()
        j_jdbc_sink = gateway.jvm.org.apache.flink.connector.jdbc.JdbcSink\
            .sink(sql, j_statement_builder, jdbc_execution_options._j_jdbc_execution_options,
                  jdbc_connection_options._j_jdbc_connection_options)
        return JdbcSink(j_jdbc_sink=j_jdbc_sink)
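
A hedged usage sketch for the sink; the import path and builder methods follow recent PyFlink releases, the JDBC connector jar is assumed to be on the classpath, and the URL, table and credentials are placeholders.

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.jdbc import (
    JdbcSink, JdbcConnectionOptions, JdbcExecutionOptions)

type_info = Types.ROW([Types.INT(), Types.STRING()])
env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_collection([(1, 'alice')], type_info=type_info)

ds.add_sink(JdbcSink.sink(
    "insert into users (id, name) values (?, ?)",
    type_info,
    JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
        .with_url("jdbc:postgresql://localhost:5432/mydb")
        .with_driver_name("org.postgresql.Driver")
        .with_user_name("user")
        .with_password("secret")
        .build(),
    JdbcExecutionOptions.builder()
        .with_batch_size(100)
        .with_max_retries(3)
        .build()))
env.execute("jdbc_sink_example")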
Example #8
    def _get_java_python_function_operator(self, func: Union[Function,
                                                             FunctionWrapper],
                                           type_info: TypeInformation,
                                           func_name: str, func_type: int):
        """
        Create a flink operator according to user provided function object, data types,
        function name and function type.

        :param func: a function object that implements the Function interface.
        :param type_info: the data type of the function output data.
        :param func_name: function name.
        :param func_type: function type, supports MAP, FLAT_MAP, etc.
        :return: A flink java operator which is responsible for execution user defined python
                 function.
        """

        gateway = get_gateway()
        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        j_input_types = self._j_data_stream.getTransformation().getOutputType()

        if type_info is None:
            output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
        elif isinstance(type_info, list):
            output_type_info = RowTypeInfo(type_info)
        else:
            output_type_info = type_info

        DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
            .python.DataStreamPythonFunction
        j_python_data_stream_scalar_function = DataStreamPythonFunction(
            func_name, bytearray(serialized_func), _get_python_env())

        DataStreamPythonFunctionInfo = gateway.jvm. \
            org.apache.flink.datastream.runtime.functions.python \
            .DataStreamPythonFunctionInfo

        j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
            j_python_data_stream_scalar_function, func_type)

        j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()

        # Set the max bundle size to 1 to force synchronous processing for the reduce function.
        from pyflink.fn_execution.flink_fn_execution_pb2 import UserDefinedDataStreamFunction
        if func_type == UserDefinedDataStreamFunction.REDUCE:
            j_conf.setInteger(
                gateway.jvm.org.apache.flink.python.PythonOptions.
                MAX_BUNDLE_SIZE, 1)
            DataStreamPythonReduceFunctionOperator = gateway.jvm.org.apache.flink.datastream \
                .runtime.operators.python.DataStreamPythonReduceFunctionOperator

            j_output_type_info = j_input_types.getTypeAt(1)
            j_python_data_stream_function_operator = DataStreamPythonReduceFunctionOperator(
                j_conf, j_input_types, j_output_type_info,
                j_python_data_stream_function_info)
            return j_python_data_stream_function_operator, j_output_type_info
        else:
            if str(func) == '_Flink_PartitionCustomMapFunction':
                DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
                    .operators.python.DataStreamPythonPartitionCustomFunctionOperator
            else:
                DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
                    .operators.python.DataStreamPythonStatelessFunctionOperator

            j_python_data_stream_function_operator = DataStreamPythonFunctionOperator(
                j_conf, j_input_types, output_type_info.get_java_type_info(),
                j_python_data_stream_function_info)

            return j_python_data_stream_function_operator, output_type_info.get_java_type_info()
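
A hedged sketch of the call path that reaches the REDUCE branch above: reduce() on a keyed stream. Under the keyed-stream layout assumed here, records arrive as (key, value) rows, which is why the output type is taken from j_input_types.getTypeAt(1). Values and the job name are illustrative.

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_collection([('a', 1), ('a', 2), ('b', 3)],
                         type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

# key_by() produces the keyed stream whose reduce() triggers the synchronous,
# max-bundle-size-1 configuration shown above.
ds.key_by(lambda t: t[0]) \
  .reduce(lambda a, b: (a[0], a[1] + b[1])) \
  .print()
env.execute("reduce_example")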