Example #1
    def _create_judf(self):
        gateway = get_gateway()

        def get_python_function_kind(udf_type):
            JPythonFunctionKind = gateway.jvm.org.apache.flink.table.functions.python.\
                PythonFunctionKind
            if udf_type == "general":
                return JPythonFunctionKind.GENERAL
            elif udf_type == "pandas":
                return JPythonFunctionKind.PANDAS
            else:
                raise TypeError("Unsupported udf_type: %s." % udf_type)

        func = self._func
        if not isinstance(self._func, UserDefinedFunction):
            func = DelegatingScalarFunction(self._func)

        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        j_input_types = utils.to_jarray(
            gateway.jvm.TypeInformation,
            [_to_java_type(i) for i in self._input_types])
        j_result_type = _to_java_type(self._result_type)
        j_function_kind = get_python_function_kind(self._udf_type)
        PythonScalarFunction = gateway.jvm \
            .org.apache.flink.table.functions.python.PythonScalarFunction
        j_scalar_function = PythonScalarFunction(self._name,
                                                 bytearray(serialized_func),
                                                 j_input_types, j_result_type,
                                                 j_function_kind,
                                                 self._deterministic,
                                                 _get_python_env())
        return j_scalar_function
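For context, a minimal sketch of the public wrapper that eventually reaches _create_judf, assuming the PyFlink 1.10-era udf() API with explicit input_types/result_type (the function itself is illustrative):

    from pyflink.table import DataTypes
    from pyflink.table.udf import udf

    # Wrap a plain callable as a general scalar UDF; registering it or
    # calling it in a query is what triggers _create_judf() internally.
    add = udf(lambda a, b: a + b,
              input_types=[DataTypes.BIGINT(), DataTypes.BIGINT()],
              result_type=DataTypes.BIGINT())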
Example #2
File: udf.py Project: zhuzhurk/flink
    def _create_judtf(self):
        func = self._func
        if not isinstance(self._func, UserDefinedFunction):
            func = DelegationTableFunction(self._func)

        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        gateway = get_gateway()
        if self._input_types is not None:
            j_input_types = utils.to_jarray(
                gateway.jvm.TypeInformation,
                [_to_java_type(i) for i in self._input_types])
        else:
            j_input_types = None

        j_result_types = utils.to_jarray(
            gateway.jvm.TypeInformation,
            [_to_java_type(i) for i in self._result_types])
        j_result_type = gateway.jvm.org.apache.flink.api.java.typeutils.RowTypeInfo(
            j_result_types)
        j_function_kind = gateway.jvm.org.apache.flink.table.functions.python. \
            PythonFunctionKind.GENERAL
        PythonTableFunction = gateway.jvm \
            .org.apache.flink.table.functions.python.PythonTableFunction
        j_table_function = PythonTableFunction(self._name,
                                               bytearray(serialized_func),
                                               j_input_types, j_result_type,
                                               j_function_kind,
                                               self._deterministic,
                                               _get_python_env())
        return j_table_function
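A matching sketch for the table-function path, assuming the udtf() wrapper from the same API generation (the split function is illustrative):

    from pyflink.table import DataTypes
    from pyflink.table.udf import udtf

    # result_types is plural; _create_judtf folds the list into a single
    # RowTypeInfo as shown above.
    @udtf(input_types=[DataTypes.STRING()], result_types=[DataTypes.STRING()])
    def split(line):
        for word in line.split(","):
            yield word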
Example #3
    def _create_judf(self, is_blink_planner, table_config):
        func = self._func
        if not isinstance(self._func, UserDefinedFunction):
            func = DelegatingScalarFunction(self._func)

        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        gateway = get_gateway()
        j_input_types = utils.to_jarray(
            gateway.jvm.TypeInformation,
            [_to_java_type(i) for i in self._input_types])
        j_result_type = _to_java_type(self._result_type)
        if is_blink_planner:
            PythonTableUtils = gateway.jvm\
                .org.apache.flink.table.planner.utils.python.PythonTableUtils
            j_scalar_function = PythonTableUtils \
                .createPythonScalarFunction(table_config,
                                            self._name,
                                            bytearray(serialized_func),
                                            j_input_types,
                                            j_result_type,
                                            self._deterministic,
                                            _get_python_env())
        else:
            PythonTableUtils = gateway.jvm.PythonTableUtils
            j_scalar_function = PythonTableUtils \
                .createPythonScalarFunction(self._name,
                                            bytearray(serialized_func),
                                            j_input_types,
                                            j_result_type,
                                            self._deterministic,
                                            _get_python_env())

        return j_scalar_function
Example #4
 def __init__(self,
              hostname=None,
              port=None,
              line_delimiter=None,
              field_delimiter=None,
              field_names=None,
              field_types=None,
              append_proctime=None):
     gateway = get_gateway()
     j_builder = gateway.jvm.org.apache.flink.python.connector.SocketTableSource.Builder()
     if hostname is not None:
         j_builder.withHostname(hostname)
     if port is not None:
         j_builder.withPort(port)
     if line_delimiter is not None:
         j_builder.withLineDelimiter(line_delimiter)
     if field_delimiter is not None:
         j_builder.withFieldDelimiter(field_delimiter)
     if field_names is not None and field_types is not None:
         j_field_names = to_jarray(gateway.jvm.String, field_names)
         j_field_types = to_jarray(gateway.jvm.TypeInformation,
                                   [_to_java_type(field_type) for field_type in field_types])
         j_builder.withSchema(j_field_names, j_field_types)
     if append_proctime is not None:
         j_builder.appendProctime(append_proctime)
     super(SocketTableSource, self).__init__(j_builder.build())
Example #5
 def __init__(self,
              field_names,
              field_types,
              path,
              field_delimiter=',',
              num_files=-1,
              write_mode=None):
     gateway = get_gateway()
     if write_mode == WriteMode.NO_OVERWRITE:
         j_write_mode = gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.NO_OVERWRITE
     elif write_mode == WriteMode.OVERWRITE:
         j_write_mode = gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE
     elif write_mode is None:
         j_write_mode = None
     else:
         raise Exception('Unsupported write_mode: %s' % write_mode)
     j_csv_table_sink = gateway.jvm.CsvTableSink(path, field_delimiter,
                                                 num_files, j_write_mode)
     j_field_names = utils.to_jarray(gateway.jvm.String, field_names)
     j_field_types = utils.to_jarray(
         gateway.jvm.TypeInformation,
         [_to_java_type(field_type) for field_type in field_types])
     j_csv_table_sink = j_csv_table_sink.configure(j_field_names,
                                                   j_field_types)
     super(CsvTableSink, self).__init__(j_csv_table_sink)
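A construction sketch for this sink; the path and field layout are illustrative:

    from pyflink.table import DataTypes
    from pyflink.table.sinks import CsvTableSink

    # The field names/types end up in the configure() call shown above.
    sink = CsvTableSink(["name", "cnt"],
                        [DataTypes.STRING(), DataTypes.BIGINT()],
                        "/tmp/out.csv",
                        field_delimiter="|")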
Example #6
 def _from_file(self, filename, schema):
     gateway = get_gateway()
     jds = gateway.jvm.PythonBridgeUtils.createDataSetFromFile(
         self._j_tenv.execEnv(), filename, True)
     return Table(
         gateway.jvm.PythonTableUtils.fromDataSet(self._j_tenv, jds,
                                                  _to_java_type(schema)))
Example #7
    def java_user_defined_function(self):
        if self._judf_placeholder is None:
            gateway = get_gateway()

            def get_python_function_kind():
                JPythonFunctionKind = gateway.jvm.org.apache.flink.table.functions.python. \
                    PythonFunctionKind
                if self._func_type == "general":
                    return JPythonFunctionKind.GENERAL
                elif self._func_type == "pandas":
                    return JPythonFunctionKind.PANDAS
                else:
                    raise TypeError("Unsupported func_type: %s." %
                                    self._func_type)

            if self._input_types is not None:
                j_input_types = utils.to_jarray(
                    gateway.jvm.TypeInformation,
                    [_to_java_type(i) for i in self._input_types])
            else:
                j_input_types = None
            j_function_kind = get_python_function_kind()
            func = self._func
            if not isinstance(self._func, UserDefinedFunction):
                func = self._create_delegate_function()

            import cloudpickle
            serialized_func = cloudpickle.dumps(func)
            self._judf_placeholder = \
                self._create_judf(serialized_func, j_input_types, j_function_kind)
        return self._judf_placeholder
Example #8
    def _create_judf(self, serialized_func, j_input_types, j_function_kind):
        if self._func_type == "pandas":
            from pyflink.table.types import DataTypes
            self._accumulator_type = DataTypes.ARRAY(self._result_type)

        j_result_type = _to_java_type(self._result_type)
        j_accumulator_type = _to_java_type(self._accumulator_type)

        gateway = get_gateway()
        PythonAggregateFunction = gateway.jvm \
            .org.apache.flink.table.functions.python.PythonAggregateFunction
        j_aggregate_function = PythonAggregateFunction(
            self._name, bytearray(serialized_func), j_input_types,
            j_result_type, j_accumulator_type, j_function_kind,
            self._deterministic, _get_python_env())
        return j_aggregate_function
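Note how a pandas aggregate derives its accumulator type automatically (an ARRAY of the result type). A sketch assuming the later udaf() wrapper with func_type="pandas" (names are illustrative):

    from pyflink.table import DataTypes
    from pyflink.table.udf import udaf

    # A pandas Series -> scalar callable; the accumulator type is filled
    # in as ARRAY(FLOAT()) by the logic above.
    mean = udaf(lambda v: v.mean(),
                result_type=DataTypes.FLOAT(),
                func_type="pandas")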
Example #9
 def __init__(self,
              field_names,
              field_types,
              path,
              field_delimiter=',',
              num_files=1,
              write_mode=None):
     # type: (list[str], list[DataType], str, str, int, int) -> None
     gateway = get_gateway()
      if write_mode == WriteMode.NO_OVERWRITE:
          j_write_mode = gateway.jvm.scala.Option.apply(
              gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.NO_OVERWRITE)
      elif write_mode == WriteMode.OVERWRITE:
          j_write_mode = gateway.jvm.scala.Option.apply(
              gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE)
     elif write_mode is None:
         j_write_mode = gateway.jvm.scala.Option.empty()
     else:
         raise Exception('Unsupported write_mode: %s' % write_mode)
     j_some_field_delimiter = gateway.jvm.scala.Option.apply(
         field_delimiter)
     j_some_num_files = gateway.jvm.scala.Option.apply(num_files)
     j_csv_table_sink = gateway.jvm.CsvTableSink(path,
                                                 j_some_field_delimiter,
                                                 j_some_num_files,
                                                 j_write_mode)
     j_field_names = utils.to_jarray(gateway.jvm.String, field_names)
     j_field_types = utils.to_jarray(
         gateway.jvm.TypeInformation,
         [_to_java_type(field_type) for field_type in field_types])
     j_csv_table_sink = j_csv_table_sink.configure(j_field_names,
                                                   j_field_types)
     super(CsvTableSink, self).__init__(j_csv_table_sink)
Example #10
    def _from_elements(self, elements, schema):
        """
        Creates a table from a collection of elements.

        :param elements: The elements to create a table from.
        :return: The result :class:`Table`.
        """

        # Serialize the elements to a temporary file, then read that file on
        # the Java side.
        temp_file = tempfile.NamedTemporaryFile(delete=False,
                                                dir=tempfile.mkdtemp())
        serializer = BatchedSerializer(self._serializer)
        try:
            try:
                serializer.dump_to_stream(elements, temp_file)
            finally:
                temp_file.close()
            row_type_info = _to_java_type(schema)
            execution_config = self._get_execution_config(
                temp_file.name, schema)
            gateway = get_gateway()
            j_objs = gateway.jvm.PythonBridgeUtils.readPythonObjects(
                temp_file.name, True)
            j_input_format = gateway.jvm.PythonTableUtils.getInputFormat(
                j_objs, row_type_info, execution_config)
            j_table_source = gateway.jvm.PythonInputFormatTableSource(
                j_input_format, row_type_info)

            return Table(self._j_tenv.fromTableSource(j_table_source))
        finally:
            os.unlink(temp_file.name)
Example #11
 def __init__(self, j_table_sink, field_names, field_types):
     gateway = get_gateway()
     j_field_names = utils.to_jarray(gateway.jvm.String, field_names)
     j_field_types = utils.to_jarray(
         gateway.jvm.TypeInformation,
         [_to_java_type(field_type) for field_type in field_types])
     j_table_sink = j_table_sink.configure(j_field_names, j_field_types)
     super(TestTableSink, self).__init__(j_table_sink)
Example #12
 def __init__(self, source_path, field_names, field_types):
     # type: (str, list[str], list[DataType]) -> None
     gateway = get_gateway()
     j_field_names = utils.to_jarray(gateway.jvm.String, field_names)
     j_field_types = utils.to_jarray(gateway.jvm.TypeInformation,
                                     [_to_java_type(field_type)
                                      for field_type in field_types])
     super(CsvTableSource, self).__init__(
         gateway.jvm.CsvTableSource(source_path, j_field_names, j_field_types))
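A usage sketch for this constructor (the path is illustrative):

    from pyflink.table import DataTypes
    from pyflink.table.sources import CsvTableSource

    # Field names and types must line up positionally.
    source = CsvTableSource("/tmp/in.csv",
                            ["name", "cnt"],
                            [DataTypes.STRING(), DataTypes.BIGINT()])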
Example #13
    def schema(self, schema_data_type):
        """
        Sets the format schema with field names and the types. Required if schema is not derived.

        :param schema_data_type: Data type from :class:`DataTypes` that describes the schema.
        :return: This :class:`Csv` object.
        """
        self._j_csv = self._j_csv.schema(_to_java_type(schema_data_type))
        return self
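A usage sketch for this descriptor method; the format schema is a single ROW type mirroring the file's columns (field names are illustrative):

    from pyflink.table import DataTypes
    from pyflink.table.descriptors import Csv

    csv_format = Csv().schema(
        DataTypes.ROW([DataTypes.FIELD("name", DataTypes.STRING()),
                       DataTypes.FIELD("cnt", DataTypes.BIGINT())]))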
Example #14
 def __init__(self, field_names, field_types, out_row=100000):
     gateway = get_gateway()
     j_field_names = utils.to_jarray(gateway.jvm.String, field_names)
     j_field_types = utils.to_jarray(
         gateway.jvm.TypeInformation,
         [_to_java_type(field_type) for field_type in field_types])
     j_table_sink = gateway.jvm.com.alibaba.flink.sink.PrintTableSink(
         j_field_names, j_field_types, out_row)
     super(PrintTableSink, self).__init__(j_table_sink)
Example #15
 def __init__(self, source_path, field_names, field_types):
     # type: (str, list[str], list[DataType]) -> None
     gateway = get_gateway()
     j_field_names = utils.to_jarray(gateway.jvm.String, field_names)
     j_field_types = utils.to_jarray(
         gateway.jvm.TypeInformation,
         [_to_java_type(field_type) for field_type in field_types])
     super(CsvTableSource, self).__init__(
         gateway.jvm.CsvTableSource(source_path, j_field_names,
                                    j_field_types))
Example #16
File: sources.py Project: xixingya/flink
    def __init__(
        self,
        source_path,
        field_names,
        field_types,
        field_delim=None,
        line_delim=None,
        quote_character=None,
        ignore_first_line=None,
        ignore_comments=None,
        lenient=None,
        empty_column_as_null=None,
    ):
        gateway = get_gateway()

        builder = gateway.jvm.CsvTableSource.builder()
        builder.path(source_path)

        for (field_name, field_type) in zip(field_names, field_types):
            builder.field(field_name, _to_java_type(field_type))

        if field_delim is not None:
            builder.fieldDelimiter(field_delim)

        if line_delim is not None:
            builder.lineDelimiter(line_delim)

        if quote_character is not None:
            # Java API has a Character type for this field. At time of writing,
            # Py4J will convert the Python str to Java Character by taking only
            # the first character.  This results in either:
            #   - Silently truncating a Python str with more than one character
            #     with no further type error from either Py4J or Java
            #     CsvTableSource
            #   - java.lang.StringIndexOutOfBoundsException from Py4J for an
            #     empty Python str.  That error can be made more friendly here.
            if len(quote_character) != 1:
                raise ValueError(
                    "Expected a single CSV quote character but got '{}'".
                    format(quote_character))
            builder.quoteCharacter(quote_character)

        if ignore_first_line:
            builder.ignoreFirstLine()

        if ignore_comments is not None:
            builder.commentPrefix(ignore_comments)

        if lenient:
            builder.ignoreParseErrors()

        if empty_column_as_null:
            builder.emptyColumnAsNull()

        super(CsvTableSource, self).__init__(builder.build())
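A usage sketch assuming the builder-based constructor signature shown in this example (path and dialect options are illustrative; only non-None options reach the Java builder):

    from pyflink.table import DataTypes
    from pyflink.table.sources import CsvTableSource

    source = CsvTableSource("/tmp/in.csv",
                            ["name", "cnt"],
                            [DataTypes.STRING(), DataTypes.BIGINT()],
                            ignore_first_line=True,
                            lenient=True)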
Example #17
 def __init__(self, field_names, field_types):
     gateway = get_gateway()
      j_print_table_sink = \
          gateway.jvm.org.apache.flink.python.connector.PrintTableSink()
     j_field_names = to_jarray(gateway.jvm.String, field_names)
     j_field_types = to_jarray(
         gateway.jvm.TypeInformation,
         [_to_java_type(field_type) for field_type in field_types])
     j_print_table_sink = j_print_table_sink.configure(
         j_field_names, j_field_types)
     super(PrintTableSink, self).__init__(j_print_table_sink)
Example #18
    def test_multiset_type(self):
        test_types = [DataTypes.MULTISET(DataTypes.BIGINT()),
                      DataTypes.MULTISET(DataTypes.STRING()),
                      DataTypes.MULTISET(DataTypes.MULTISET(DataTypes.BIGINT())),
                      DataTypes.MULTISET(DataTypes.MULTISET(DataTypes.STRING()))]

        java_types = [_to_java_type(item) for item in test_types]

        converted_python_types = [_from_java_type(item) for item in java_types]

        self.assertEqual(test_types, converted_python_types)
Example #19
 def __init__(self, field_names=None, data_types=None, j_table_schema=None):
     if j_table_schema is None:
         gateway = get_gateway()
         j_field_names = to_jarray(gateway.jvm.String, field_names)
         j_data_types = to_jarray(
             gateway.jvm.TypeInformation,
             [_to_java_type(item) for item in data_types])
         self._j_table_schema = gateway.jvm.TableSchema(
             j_field_names, j_data_types)
     else:
         self._j_table_schema = j_table_schema
Example #20
    def test_list_view_type(self):
        test_types = [
            DataTypes.LIST_VIEW(DataTypes.BIGINT()),
            DataTypes.LIST_VIEW(DataTypes.STRING())
        ]

        java_types = [_to_java_type(item) for item in test_types]

        converted_python_types = [_from_java_type(item) for item in java_types]

        self.assertEqual(test_types, converted_python_types)
Example #21
    def test_row_type(self):
        test_types = [DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT()),
                                     DataTypes.FIELD("b",
                                                     DataTypes.ROW(
                                                         [DataTypes.FIELD("c",
                                                                          DataTypes.STRING())]))])]

        java_types = [_to_java_type(item) for item in test_types]

        converted_python_types = [_from_java_type(item) for item in java_types]

        self.assertEqual(test_types, converted_python_types)
Example #22
 def _create_judf(self, serialized_func, j_input_types, j_function_kind):
     gateway = get_gateway()
     j_result_type = _to_java_type(self._result_type)
     PythonScalarFunction = gateway.jvm \
         .org.apache.flink.table.functions.python.PythonScalarFunction
     j_scalar_function = PythonScalarFunction(self._name,
                                              bytearray(serialized_func),
                                              j_input_types, j_result_type,
                                              j_function_kind,
                                              self._deterministic,
                                              _get_python_env())
     return j_scalar_function
Example #23
    def test_array_type(self):
        # nullable/not_null flag will be lost during the conversion.
        test_types = [DataTypes.ARRAY(DataTypes.BIGINT()),
                      DataTypes.ARRAY(DataTypes.BIGINT()),
                      DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
                      DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))]

        java_types = [_to_java_type(item) for item in test_types]

        converted_python_types = [_from_java_type(item) for item in java_types]

        self.assertEqual(test_types, converted_python_types)
Example #24
    def schema(self, schema_data_type):
        """
        Sets the schema using :class:`DataTypes`.

        JSON objects are represented as ROW types.

        The schema might be nested.

        :param schema_data_type: Data type that describes the schema.
        :return: This object.
        """
        self._j_json = self._j_json.schema(_to_java_type(schema_data_type))
        return self
Example #25
    def test_array_type(self):
        test_types = [DataTypes.ARRAY(DataTypes.BIGINT()),
                      # an ARRAY of a not-null basic type maps to a primitive array
                      DataTypes.ARRAY(DataTypes.BIGINT().not_null()),
                      DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
                      DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))]

        java_types = [_to_java_type(item) for item in test_types]

        converted_python_types = [_from_java_type(item) for item in java_types]

        self.assertEqual(test_types, converted_python_types)
Example #26
def _to_flink_type_string(data_type):
    FlinkTypeConverter = get_java_class("com.alibaba.alink.operator.common.io.types.FlinkTypeConverter")
    if isinstance(data_type, (AlinkDataType,)):
        type_string = data_type.to_type_string()
    else:
        type_string = FlinkTypeConverter.getTypeString(_to_java_type(data_type))
    mapping = {
        "TINYINT": "BYTE",
        "SMALLINT": "SHORT",
        "VARCHAR": "STRING",
    }
    if type_string in mapping:
        type_string = mapping[type_string]
    return type_string
Example #27
 def _from_file(self, filename, schema):
     gateway = get_gateway()
     blink_t_env_class = get_java_class(gateway.jvm.org.apache.flink.table.
                                        api.internal.TableEnvironmentImpl)
     if blink_t_env_class == self._j_tenv.getClass():
         raise NotImplementedError(
             "The operation 'from_elements' in batch mode is currently "
             "not supported when using blink planner.")
     else:
         jds = gateway.jvm.PythonBridgeUtils.createDataSetFromFile(
             self._j_tenv.execEnv(), filename, True)
         return Table(
             gateway.jvm.PythonTableUtils.fromDataSet(
                 self._j_tenv, jds, _to_java_type(schema)))
Example #28
 def get_function_definition(f):
     gateway = get_gateway()
     if isinstance(f, UserDefinedTableFunctionWrapper):
         # TypeInference was not supported for TableFunction in the old
         # planner. Use TableFunctionDefinition to work around this issue.
         j_result_types = to_jarray(gateway.jvm.TypeInformation,
                                    [_to_java_type(i) for i in f._result_types])
         j_result_type = gateway.jvm.org.apache.flink.api.java.typeutils.RowTypeInfo(
             j_result_types)
         return gateway.jvm.org.apache.flink.table.functions.TableFunctionDefinition(
             'f', f.java_user_defined_function(), j_result_type)
     else:
         return f.java_user_defined_function()
Example #29
    def field(self, field_name, field_type):
        """
        Adds a format field with the field name and the data type or type string. Required.
        This method can be called multiple times. The call order also defines
        the order of the fields in the format.

        :param field_name: The field name.
        :param field_type: The data type or type string of the field.
        :return: This :class:`OldCsv` object.
        """
        if isinstance(field_type, str):
            self._j_csv = self._j_csv.field(field_name, field_type)
        else:
            self._j_csv = self._j_csv.field(field_name, _to_java_type(field_type))
        return self
Example #30
    def field(self, field_name, field_type):
        """
        Adds a format field with the field name and the data type or type string. Required.
        This method can be called multiple times. The call order also defines
        the order of the fields in the format.

        :param field_name: The field name.
        :param field_type: The data type or type string of the field.
        :return: This :class:`OldCsv` object.
        """
        # `unicode` exists on Python 2 only; on Python 3 this relies on a
        # module-level alias such as `unicode = str`.
        if isinstance(field_type, (str, unicode)):
            self._j_csv = self._j_csv.field(field_name, field_type)
        else:
            self._j_csv = self._j_csv.field(field_name, _to_java_type(field_type))
        return self
Example #31
    def field(self, field_name, field_type):
        """
        Adds a field with the field name and the data type or type string. Required.
        This method can be called multiple times. The call order also defines
        the order of the fields in a row. The following document introduces the
        type strings:
        https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connect.html#type-strings

        :param field_name: The field name.
        :param field_type: The data type or type string of the field.
        :return: This schema object.
        """
        # `unicode` exists on Python 2 only; on Python 3 this relies on a
        # module-level alias such as `unicode = str`.
        if isinstance(field_type, (str, unicode)):
            self._j_schema = self._j_schema.field(field_name, field_type)
        else:
            self._j_schema = self._j_schema.field(field_name, _to_java_type(field_type))
        return self
Example #32
    def field(self, field_name, field_type):
        """
        Adds a field with the field name and the data type or type string. Required.
        This method can be called multiple times. The call order also defines
        the order of the fields in a row. The following document introduces the
        type strings:
        https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connect.html#type-strings

        :param field_name: The field name.
        :param field_type: The data type or type string of the field.
        :return: This schema object.
        """
        if isinstance(field_type, str):
            self._j_schema = self._j_schema.field(field_name, field_type)
        else:
            self._j_schema = self._j_schema.field(field_name, _to_java_type(field_type))
        return self
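A usage sketch for the schema builder; each field() call appends one column, and a type string may stand in for a DataType (names are illustrative):

    from pyflink.table import DataTypes
    from pyflink.table.descriptors import Schema

    schema = (Schema()
              .field("name", DataTypes.STRING())
              .field("cnt", "BIGINT"))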
Example #33
 def _create_judf(self, serialized_func, j_input_types, j_function_kind):
     gateway = get_gateway()
     j_result_types = utils.to_jarray(
         gateway.jvm.TypeInformation,
         [_to_java_type(i) for i in self._result_types])
     j_result_type = gateway.jvm.org.apache.flink.api.java.typeutils.RowTypeInfo(
         j_result_types)
     PythonTableFunction = gateway.jvm \
         .org.apache.flink.table.functions.python.PythonTableFunction
     j_table_function = PythonTableFunction(self._name,
                                            bytearray(serialized_func),
                                            j_input_types, j_result_type,
                                            j_function_kind,
                                            self._deterministic,
                                            _get_python_env())
     return j_table_function