Example #1
 def for_schema(schema: 'CsvSchema') -> 'CsvReaderFormat':
     """
     Builds a :class:`CsvReaderFormat` using `CsvSchema`.
     """
     jvm = get_gateway().jvm
     jackson = jvm.org.apache.flink.shaded.jackson2.com.fasterxml.jackson
     constructor = get_java_class(jvm.org.apache.flink.formats.csv.CsvReaderFormat) \
         .getDeclaredConstructor(
         to_jarray(jvm.Class, [
             get_java_class(jackson.dataformat.csv.CsvMapper),
             get_java_class(jackson.dataformat.csv.CsvSchema),
             get_java_class(jvm.Class),
             get_java_class(jvm.org.apache.flink.formats.common.Converter),
             get_java_class(jvm.org.apache.flink.api.common.typeinfo.TypeInformation),
             get_java_class(jvm.boolean)
         ])
     )
     constructor.setAccessible(True)
     j_csv_format = constructor.newInstance(
         to_jarray(jvm.Object, [
             jackson.dataformat.csv.CsvMapper(), schema._j_schema,
             get_java_class(jackson.databind.JsonNode),
             jvm.org.apache.flink.formats.csv.CsvToRowDataConverters(
                 False).createRowConverter(
                     _to_java_data_type(schema._data_type).getLogicalType(),
                     True),
             jvm.org.apache.flink.table.runtime.typeutils.InternalTypeInfo.
             of(_to_java_data_type(
                 schema._data_type).getLogicalType()), False
         ]))
     return CsvReaderFormat(j_csv_format)
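For context, here is a minimal usage sketch of the public entry point that the reflective construction above serves, assuming a PyFlink DataStream job; the schema columns and the '/tmp/input.csv' path are illustrative placeholders.

from pyflink.common import WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.file_system import FileSource
from pyflink.datastream.formats.csv import CsvReaderFormat, CsvSchema

env = StreamExecutionEnvironment.get_execution_environment()
# Hypothetical schema; column names and types are placeholders.
schema = CsvSchema.builder() \
    .add_number_column('id') \
    .add_string_column('name') \
    .set_column_separator(',') \
    .build()
source = FileSource.for_record_stream_format(
    CsvReaderFormat.for_schema(schema), '/tmp/input.csv').build()
ds = env.from_source(source, WatermarkStrategy.no_watermarks(), 'csv-source')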
Example #2
    def _create_judf(self, serialized_func, j_input_types, j_function_kind):
        if self._func_type == "pandas":
            from pyflink.table.types import DataTypes
            self._accumulator_type = DataTypes.ARRAY(self._result_type)

        if j_input_types is not None:
            gateway = get_gateway()
            j_input_types = java_utils.to_jarray(
                gateway.jvm.DataType,
                [_to_java_data_type(i) for i in self._input_types])
        j_result_type = _to_java_data_type(self._result_type)
        j_accumulator_type = _to_java_data_type(self._accumulator_type)

        gateway = get_gateway()
        if self._is_table_aggregate:
            PythonAggregateFunction = gateway.jvm \
                .org.apache.flink.table.functions.python.PythonTableAggregateFunction
        else:
            PythonAggregateFunction = gateway.jvm \
                .org.apache.flink.table.functions.python.PythonAggregateFunction
        j_aggregate_function = PythonAggregateFunction(
            self._name, bytearray(serialized_func), j_input_types,
            j_result_type, j_accumulator_type, j_function_kind,
            self._deterministic, self._takes_row_as_input, _get_python_env())
        return j_aggregate_function
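A hedged sketch of the user-facing path that ends up in this _create_judf call: a pandas UDAF declared with func_type='pandas', for which the accumulator type becomes ARRAY(result_type) as shown above. The aggregate function itself is illustrative.

from pyflink.table import DataTypes
from pyflink.table.udf import udaf

# Illustrative pandas aggregate; PyFlink wraps it into a Java
# PythonAggregateFunction via _create_judf above.
mean_udaf = udaf(lambda v: v.mean(),
                 result_type=DataTypes.FLOAT(),
                 func_type="pandas")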
Example #3
    def cast(self, data_type: DataType) -> 'Expression':
        """
        Converts a value to a given data type.

        e.g. lit("42").cast(DataTypes.INT()) leads to 42.
        """
        return _binary_op("cast")(self, _to_java_data_type(data_type))
Example #4
    def __init__(self,
                 field_names,
                 field_types,
                 path,
                 field_delimiter=',',
                 num_files=-1,
                 write_mode=None):
        gateway = get_gateway()
        if write_mode == WriteMode.NO_OVERWRITE:
            j_write_mode = gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.NO_OVERWRITE
        elif write_mode == WriteMode.OVERWRITE:
            j_write_mode = gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE
        elif write_mode is None:
            j_write_mode = None
        else:
            raise Exception('Unsupported write_mode: %s' % write_mode)
        j_field_names = java_utils.to_jarray(gateway.jvm.String, field_names)
        j_field_types = java_utils.to_jarray(
            gateway.jvm.DataType,
            [_to_java_data_type(field_type) for field_type in field_types])
        j_csv_table_sink = gateway.jvm.CsvTableSink(path, field_delimiter,
                                                    num_files, j_write_mode,
                                                    j_field_names,
                                                    j_field_types)

        super(CsvTableSink, self).__init__(j_csv_table_sink)
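A sketch of how this legacy sink is typically constructed; field names, types and the output path are placeholders, and registration applies only to older PyFlink versions.

from pyflink.table import DataTypes
from pyflink.table.sinks import CsvTableSink

sink = CsvTableSink(
    ['id', 'name'],
    [DataTypes.BIGINT(), DataTypes.STRING()],
    '/tmp/output.csv',
    field_delimiter='|',
    num_files=1)
# On older PyFlink versions: t_env.register_table_sink('csv_sink', sink)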
Example #5
    def _java_user_defined_function(self):
        if self._judf_placeholder is None:
            gateway = get_gateway()

            def get_python_function_kind():
                JPythonFunctionKind = gateway.jvm.org.apache.flink.table.functions.python. \
                    PythonFunctionKind
                if self._func_type == "general":
                    return JPythonFunctionKind.GENERAL
                elif self._func_type == "pandas":
                    return JPythonFunctionKind.PANDAS
                else:
                    raise TypeError("Unsupported func_type: %s." %
                                    self._func_type)

            if self._input_types is not None:
                j_input_types = java_utils.to_jarray(
                    gateway.jvm.DataType,
                    [_to_java_data_type(i) for i in self._input_types])
            else:
                j_input_types = None
            j_function_kind = get_python_function_kind()
            func = self._func
            if not isinstance(self._func, UserDefinedFunction):
                func = self._create_delegate_function()

            import cloudpickle
            serialized_func = cloudpickle.dumps(func)
            self._judf_placeholder = \
                self._create_judf(serialized_func, j_input_types, j_function_kind)
        return self._judf_placeholder
Example #6
 def get_type_info(self):
     if self._type_info is None:
         jvm = get_gateway().jvm
         j_type_info = jvm.org.apache.flink.table.types.utils.LegacyTypeInfoDataTypeConverter \
             .toLegacyTypeInfo(_to_java_data_type(self._row_type))
         self._type_info = _from_java_type(j_type_info)
     return self._type_info
Example #7
def _write_row_data_to_parquet_file(path: str, row_type: RowType, rows: List[Row]):
    jvm = get_gateway().jvm
    flink = jvm.org.apache.flink

    j_output_stream = flink.core.fs.local.LocalDataOutputStream(jvm.java.io.File(path))
    j_bulk_writer = flink.formats.parquet.row.ParquetRowDataBuilder.createWriterFactory(
        _to_java_data_type(row_type).getLogicalType(),
        create_hadoop_configuration(Configuration()),
        True,
    ).create(j_output_stream)
    row_row_converter = flink.table.data.conversion.RowRowConverter.create(
        _to_java_data_type(row_type)
    )
    row_row_converter.open(row_row_converter.getClass().getClassLoader())
    for row in rows:
        j_bulk_writer.addElement(row_row_converter.toInternal(to_java_data_structure(row)))
    j_bulk_writer.finish()
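A call sketch for the test helper above; the row type, rows and target path are illustrative.

from pyflink.common import Row
from pyflink.table import DataTypes

row_type = DataTypes.ROW([
    DataTypes.FIELD('id', DataTypes.INT()),
    DataTypes.FIELD('name', DataTypes.STRING())])
_write_row_data_to_parquet_file(
    '/tmp/test.parquet', row_type, [Row(1, 'a'), Row(2, 'b')])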
Example #8
                def apply(self, ds):
                    jvm = get_gateway().jvm

                    if _check_if_row_data_type(ds):
                        return ds

                    j_map_function = jvm.org.apache.flink.python.util.PythonConnectorUtils \
                        .RowRowMapper(_to_java_data_type(row_type))
                    return DataStream(ds._j_data_stream.process(j_map_function))
Example #9
    def for_schema(schema: 'CsvSchema') -> 'BulkWriterFactory':
        """
        Builds a :class:`BulkWriterFactory` for writing records to files in CSV format.
        """
        jvm = get_gateway().jvm
        csv = jvm.org.apache.flink.formats.csv

        j_factory = csv.PythonCsvUtils.createCsvBulkWriterFactory(
            schema._j_schema, _to_java_data_type(schema._row_type))
        return RowDataBulkWriterFactory(j_factory, schema._row_type)
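A hedged sketch of wiring the returned factory into a FileSink; the schema and the output directory are placeholders.

from pyflink.datastream.connectors.file_system import FileSink
from pyflink.datastream.formats.csv import CsvBulkWriters, CsvSchema

schema = CsvSchema.builder() \
    .add_number_column('id') \
    .add_string_column('name') \
    .build()
sink = FileSink.for_bulk_format(
    '/tmp/csv-out', CsvBulkWriters.for_schema(schema)).build()
# ds.sink_to(sink)  # assuming ds produces Rows matching the schema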
Example #10
 def __init__(self, field_names, field_types):
     TestTableSink._ensure_initialized()
     gateway = get_gateway()
     j_field_names = java_utils.to_jarray(gateway.jvm.String, field_names)
     j_field_types = java_utils.to_jarray(
         gateway.jvm.DataType,
         [_to_java_data_type(field_type) for field_type in field_types])
     super(TestRetractSink, self).__init__(
         gateway.jvm.org.apache.flink.table.utils.TestingSinks.
         TestAppendingSink(j_field_names, j_field_types))
Example #11
    def __init__(
        self,
        source_path,
        field_names,
        field_types,
        field_delim=None,
        line_delim=None,
        quote_character=None,
        ignore_first_line=None,
        ignore_comments=None,
        lenient=None,
        empty_column_as_null=None,
    ):
        gateway = get_gateway()

        builder = gateway.jvm.CsvTableSource.builder()
        builder.path(source_path)

        for (field_name, field_type) in zip(field_names, field_types):
            builder.field(field_name, _to_java_data_type(field_type))

        if field_delim is not None:
            builder.fieldDelimiter(field_delim)

        if line_delim is not None:
            builder.lineDelimiter(line_delim)

        if quote_character is not None:
            # Java API has a Character type for this field. At time of writing,
            # Py4J will convert the Python str to Java Character by taking only
            # the first character.  This results in either:
            #   - Silently truncating a Python str with more than one character
            #     with no further type error from either Py4J or Java
            #     CsvTableSource
            #   - java.lang.StringIndexOutOfBoundsException from Py4J for an
            #     empty Python str.  That error can be made more friendly here.
            if len(quote_character) != 1:
                raise ValueError(
                    "Expected a single CSV quote character but got '{}'".format(quote_character)
                )
            builder.quoteCharacter(quote_character)

        if ignore_first_line:
            builder.ignoreFirstLine()

        if ignore_comments is not None:
            builder.commentPrefix(ignore_comments)

        if lenient:
            builder.ignoreParseErrors()

        if empty_column_as_null:
            builder.emptyColumnAsNull()

        super(CsvTableSource, self).__init__(builder.build())
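A construction sketch for this legacy source; the path, fields and options are placeholders, and registration via register_table_source applies only to older PyFlink versions.

from pyflink.table import DataTypes
from pyflink.table.sources import CsvTableSource

source = CsvTableSource(
    '/tmp/input.csv',
    ['id', 'name'],
    [DataTypes.BIGINT(), DataTypes.STRING()],
    field_delim='|',
    ignore_first_line=True,
    empty_column_as_null=True)
# On older PyFlink versions: t_env.register_table_source('csv_source', source)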
Example #12
    def for_row_type(row_type: RowType,
                     writer_properties: Optional[Configuration] = None,
                     hadoop_config: Optional[Configuration] = None) \
            -> BulkWriterFactory:
        """
        Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into Orc
        files in a batch fashion.

        Example:
        ::

            >>> row_type = DataTypes.ROW([
            ...     DataTypes.FIELD('string', DataTypes.STRING()),
            ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
            ... ])
            >>> row_type_info = Types.ROW_NAMED(
            ...     ['string', 'int_array'],
            ...     [Types.STRING(), Types.LIST(Types.INT())]
            ... )
            >>> sink = FileSink.for_bulk_format(
            ...     OUTPUT_DIR, OrcBulkWriters.for_row_type(
            ...         row_type=row_type,
            ...         writer_properties=Configuration(),
            ...         hadoop_config=Configuration(),
            ...     )
            ... ).build()
            >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

        Note that in the above example, an identity map that declares the RowTypeInfo is necessary
        before ``sink_to`` when ``ds`` is a source stream producing **RowData** records,
        because RowDataBulkWriterFactory assumes the input record type is **Row**.
        """
        if not isinstance(row_type, RowType):
            raise TypeError('row_type must be an instance of RowType')

        j_data_type = _to_java_data_type(row_type)
        jvm = get_gateway().jvm
        j_row_type = j_data_type.getLogicalType()
        orc_types = to_jarray(
            jvm.org.apache.flink.table.types.logical.LogicalType,
            [i for i in j_row_type.getChildren()])
        type_description = jvm.org.apache.flink.orc \
            .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
        if writer_properties is None:
            writer_properties = Configuration()
        if hadoop_config is None:
            hadoop_config = Configuration()

        return RowDataBulkWriterFactory(
            jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
                jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                    type_description.toString(), orc_types),
                create_java_properties(writer_properties),
                create_hadoop_configuration(hadoop_config)), row_type)
Example #13
 def for_schema(schema: 'CsvSchema') -> 'CsvReaderFormat':
     """
     Builds a :class:`CsvReaderFormat` using `CsvSchema`.
     """
     jvm = get_gateway().jvm
     j_csv_format = jvm.org.apache.flink.formats.csv.CsvReaderFormatFactory \
         .createCsvReaderFormat(
             schema._j_schema,
             _to_java_data_type(schema._data_type)
         )
     return CsvReaderFormat(j_csv_format)
Example #14
 def __init__(self, hadoop_config: Configuration, row_type: RowType,
              batch_size: int, is_utc_timestamp: bool,
              is_case_sensitive: bool):
     jvm = get_gateway().jvm
     j_row_type = _to_java_data_type(row_type).getLogicalType()
     produced_type_info = jvm.org.apache.flink.table.runtime.typeutils. \
         InternalTypeInfo.of(j_row_type)
     j_parquet_columnar_format = jvm.org.apache.flink.formats.parquet. \
         ParquetColumnarRowInputFormat(self._create_hadoop_configuration(hadoop_config),
                                       j_row_type, produced_type_info, batch_size,
                                       is_utc_timestamp, is_case_sensitive)
     super().__init__(j_parquet_columnar_format)
Example #15
 def for_schema(schema: 'CsvSchema') -> 'CsvReaderFormat':
     """
     Builds a :class:`CsvReaderFormat` using `CsvSchema`.
     """
     from pyflink.table.types import _to_java_data_type
     jvm = get_gateway().jvm
     j_csv_format = jvm.org.apache.flink.formats.csv.PythonCsvUtils \
         .createCsvReaderFormat(
             schema._j_schema,
             _to_java_data_type(schema._row_type)
         )
     return CsvReaderFormat(j_csv_format)
Example #16
 def from_fields(self, field_names: List[str],
                 field_data_types: List[DataType]) -> 'Schema.Builder':
     """
     Adopts the given field names and field data types as physical columns of the schema.
     """
     gateway = get_gateway()
     j_field_names = to_jarray(gateway.jvm.String, field_names)
     j_field_data_types = to_jarray(gateway.jvm.AbstractDataType, [
         _to_java_data_type(field_data_type)
         for field_data_type in field_data_types
     ])
     self._j_builder.fromFields(j_field_names, j_field_data_types)
     return self
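A short sketch of from_fields on the schema builder; field names and data types are illustrative.

from pyflink.table import DataTypes, Schema

schema = Schema.new_builder() \
    .from_fields(['id', 'name'], [DataTypes.BIGINT(), DataTypes.STRING()]) \
    .build()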
Example #17
    def for_schema(schema: 'CsvSchema') -> 'BulkWriterFactory':
        """
        Creates a :class:`~pyflink.common.serialization.BulkWriterFactory` for writing records to
        files in CSV format.
        """
        from pyflink.table.types import _to_java_data_type

        jvm = get_gateway().jvm
        csv = jvm.org.apache.flink.formats.csv

        j_factory = csv.PythonCsvUtils.createCsvBulkWriterFactory(
            schema._j_schema, _to_java_data_type(schema._row_type))
        return RowDataBulkWriterFactory(j_factory, schema._row_type)
Example #18
    def test_map_view_type(self):
        test_types = [
            DataTypes.MAP_VIEW(DataTypes.STRING(), DataTypes.BIGINT()),
            DataTypes.MAP_VIEW(DataTypes.INT(), DataTypes.STRING())
        ]

        java_types = [_to_java_data_type(item) for item in test_types]

        converted_python_types = [
            _from_java_data_type(item) for item in java_types
        ]

        self.assertEqual(test_types, converted_python_types)
Example #19
 def _create_judf(self, serialized_func, j_input_types, j_function_kind):
     gateway = get_gateway()
     j_result_type = _to_java_data_type(self._result_type)
     PythonScalarFunction = gateway.jvm \
         .org.apache.flink.table.functions.python.PythonScalarFunction
     j_scalar_function = PythonScalarFunction(self._name,
                                              bytearray(serialized_func),
                                              j_input_types, j_result_type,
                                              j_function_kind,
                                              self._deterministic,
                                              self._takes_row_as_input,
                                              _get_python_env())
     return j_scalar_function
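The user-facing counterpart that reaches this _create_judf is a scalar udf; a minimal, illustrative declaration:

from pyflink.table import DataTypes
from pyflink.table.udf import udf

add = udf(lambda i, j: i + j, result_type=DataTypes.BIGINT())
# e.g. tab.select(add(col('a'), col('b')))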
Example #20
 def __init__(self,
              field_names: List[str] = None,
              data_types: List[DataType] = None,
              j_table_schema=None):
     if j_table_schema is None:
         gateway = get_gateway()
         j_field_names = to_jarray(gateway.jvm.String, field_names)
         j_data_types = to_jarray(
             gateway.jvm.DataType,
             [_to_java_data_type(item) for item in data_types])
         self._j_table_schema = gateway.jvm.TableSchema.builder()\
             .fields(j_field_names, j_data_types).build()
     else:
         self._j_table_schema = j_table_schema
Example #21
    def test_multiset_type(self):
        test_types = [
            DataTypes.MULTISET(DataTypes.BIGINT()),
            DataTypes.MULTISET(DataTypes.STRING()),
            DataTypes.MULTISET(DataTypes.MULTISET(DataTypes.BIGINT())),
            DataTypes.MULTISET(DataTypes.MULTISET(DataTypes.STRING()))
        ]

        java_types = [_to_java_data_type(item) for item in test_types]

        converted_python_types = [
            _from_java_data_type(item) for item in java_types
        ]

        self.assertEqual(test_types, converted_python_types)
Example #22
def lit(v, data_type: DataType = None) -> Expression:
    """
    Creates a SQL literal.

    The data type is derived from the object's class and its value. For example, `lit(12)` leads
    to `INT`, `lit("abc")` leads to `CHAR(3)`.

    Example:
    ::

        >>> tab.select(col("key"), lit("abc"))
    """
    if data_type is None:
        return _unary_op("lit", v)
    else:
        return _binary_op("lit", v, _to_java_data_type(data_type))
Example #23
    def field(self, field_name: str, field_type: Union[DataType, str]) -> 'Schema':
        """
        Adds a field with the field name and the data type or type string. Required.
        This method can be called multiple times. The call order of this method also defines
        the order of the fields in a row. Here is a document that introduces the type strings:
        https://nightlies.apache.org/flink/flink-docs-stable/dev/table/connect.html#type-strings

        :param field_name: The field name.
        :param field_type: The data type or type string of the field.
        :return: This schema object.
        """
        if isinstance(field_type, str):
            self._j_schema = self._j_schema.field(field_name, field_type)
        else:
            self._j_schema = self._j_schema.field(field_name, _to_java_data_type(field_type))
        return self
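A usage sketch of this legacy descriptor, showing both the DataType branch and the type-string branch; field names are placeholders and the import applies to older PyFlink versions.

from pyflink.table import DataTypes
from pyflink.table.descriptors import Schema

schema = Schema() \
    .field('id', DataTypes.BIGINT()) \
    .field('name', 'VARCHAR')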
Example #24
 def _create_judf(self, serialized_func, j_input_types, j_function_kind):
     gateway = get_gateway()
     j_result_types = java_utils.to_jarray(
         gateway.jvm.DataType,
         [_to_java_data_type(i) for i in self._result_types])
     j_result_type = gateway.jvm.DataTypes.ROW(j_result_types)
     PythonTableFunction = gateway.jvm \
         .org.apache.flink.table.functions.python.PythonTableFunction
     j_table_function = PythonTableFunction(self._name,
                                            bytearray(serialized_func),
                                            j_input_types, j_result_type,
                                            j_function_kind,
                                            self._deterministic,
                                            self._takes_row_as_input,
                                            _get_python_env())
     return j_table_function
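Correspondingly, a table function declared with udtf ends up here; a minimal, illustrative sketch whose result row has two BIGINT fields.

from pyflink.table import DataTypes
from pyflink.table.udf import udtf

@udtf(result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
def split_pair(x):
    yield x, x + 1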
Example #25
    def test_row_type(self):
        test_types = [
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.INT()),
                DataTypes.FIELD(
                    "b",
                    DataTypes.ROW([DataTypes.FIELD("c", DataTypes.STRING())]))
            ])
        ]

        java_types = [_to_java_data_type(item) for item in test_types]

        converted_python_types = [
            _from_java_data_type(item) for item in java_types
        ]

        self.assertEqual(test_types, converted_python_types)
Example #26
    def test_array_type(self):
        # nullable/not_null flag will be lost during the conversion.
        test_types = [
            DataTypes.ARRAY(DataTypes.BIGINT()),
            DataTypes.ARRAY(DataTypes.BIGINT()),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
            DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))
        ]

        java_types = [_to_java_data_type(item) for item in test_types]

        converted_python_types = [
            _from_java_data_type(item) for item in java_types
        ]

        self.assertEqual(test_types, converted_python_types)
Example #27
    def __init__(self,
                 row_type: 'RowType',
                 hadoop_config: Optional[Configuration] = None,
                 batch_size: int = 2048,
                 is_utc_timestamp: bool = False,
                 is_case_sensitive: bool = True):
        if not hadoop_config:
            hadoop_config = Configuration()

        from pyflink.table.types import _to_java_data_type
        jvm = get_gateway().jvm
        j_row_type = _to_java_data_type(row_type).getLogicalType()
        produced_type_info = jvm.org.apache.flink.table.runtime.typeutils. \
            InternalTypeInfo.of(j_row_type)
        j_parquet_columnar_format = jvm.org.apache.flink.formats.parquet. \
            ParquetColumnarRowInputFormat(create_hadoop_configuration(hadoop_config),
                                          j_row_type, produced_type_info, batch_size,
                                          is_utc_timestamp, is_case_sensitive)
        super().__init__(j_parquet_columnar_format)
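A hedged sketch of reading with this format through a FileSource (assuming FileSource.for_bulk_file_format is available); the row type and path are placeholders.

from pyflink.common import Configuration, WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.file_system import FileSource
from pyflink.datastream.formats.parquet import ParquetColumnarRowInputFormat
from pyflink.table import DataTypes

env = StreamExecutionEnvironment.get_execution_environment()
row_type = DataTypes.ROW([
    DataTypes.FIELD('id', DataTypes.INT()),
    DataTypes.FIELD('name', DataTypes.STRING())])
parquet_format = ParquetColumnarRowInputFormat(
    row_type, hadoop_config=Configuration(), batch_size=2048)
source = FileSource.for_bulk_file_format(
    parquet_format, '/tmp/input.parquet').build()
ds = env.from_source(source, WatermarkStrategy.no_watermarks(), 'parquet-source')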
Example #28
    def for_row_type(row_type: RowType,
                     hadoop_config: Optional[Configuration] = None,
                     utc_timestamp: bool = False) -> 'BulkWriterFactory':
        """
        Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into
        Parquet files in a batch fashion.

        Example:
        ::

            >>> row_type = DataTypes.ROW([
            ...     DataTypes.FIELD('string', DataTypes.STRING()),
            ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
            ... ])
            >>> row_type_info = Types.ROW_NAMED(
            ...     ['string', 'int_array'],
            ...     [Types.STRING(), Types.LIST(Types.INT())]
            ... )
            >>> sink = FileSink.for_bulk_format(
            ...     OUTPUT_DIR, ParquetBulkWriter.for_row_type(
            ...         row_type,
            ...         hadoop_config=Configuration(),
            ...         utc_timestamp=True,
            ...     )
            ... ).build()
            >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

        Note that in the above example, an identity map that declares the RowTypeInfo is necessary
        before ``sink_to`` when ``ds`` is a source stream producing **RowData** records, because
        RowDataBulkWriterFactory assumes the input record type is **Row**.
        """
        if not hadoop_config:
            hadoop_config = Configuration()

        jvm = get_gateway().jvm
        JParquetRowDataBuilder = jvm.org.apache.flink.formats.parquet.row.ParquetRowDataBuilder
        return RowDataBulkWriterFactory(
            JParquetRowDataBuilder.createWriterFactory(
                _to_java_data_type(row_type).getLogicalType(),
                create_hadoop_configuration(hadoop_config), utc_timestamp),
            row_type)
Example #29
        def column(self, column_name: str,
                   data_type: Union[str, DataType]) -> 'Schema.Builder':
            """
            Declares a physical column that is appended to this schema.

            Physical columns are regular columns known from databases. They define the names, the
            types, and the order of fields in the physical data. Thus, physical columns represent
            the payload that is read from and written to an external system. Connectors and formats
            use these columns (in the defined order) to configure themselves. Other kinds of columns
            can be declared between physical columns but will not influence the final physical
            schema.

            :param column_name: Column name
            :param data_type: Data type of the column
            """
            if isinstance(data_type, str):
                self._j_builder.column(column_name, data_type)
            else:
                self._j_builder.column(column_name,
                                       _to_java_data_type(data_type))
            return self
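A short builder sketch for column with both a DataType and a type string; names are illustrative.

from pyflink.table import DataTypes, Schema

schema = Schema.new_builder() \
    .column('id', DataTypes.BIGINT()) \
    .column('name', 'STRING') \
    .build()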
Example #30
        def column_by_metadata(self,
                               column_name: str,
                               data_type: Union[DataType, str],
                               metadata_key: str = None,
                               is_virtual: bool = False) -> 'Schema.Builder':
            """
            Declares a metadata column that is appended to this schema.

            Metadata columns allow access to connector and/or format specific fields for every row
            of a table. For example, a metadata column can be used to read and write the timestamp
            from and to Kafka records for time-based operations. The connector and format
            documentation lists the available metadata fields for every component.

            Every metadata field is identified by a string-based key and has a documented data
            type. The metadata key can be omitted if the column name should be used as the
            identifying metadata key. For convenience, the runtime will perform an explicit cast if
            the data type of the column differs from the data type of the metadata field. Of course,
            this requires that the two data types are compatible.

            By default, a metadata column can be used for both reading and writing. However, in
            many cases an external system provides more read-only metadata fields than writable
            fields. Therefore, it is possible to exclude metadata columns from persisting by setting
            the ``is_virtual`` flag to ``True``.

            :param column_name: Column name
            :param data_type: Data type of the column
            :param metadata_key: Identifying metadata key; if None, the column name will be used as
                the metadata key
            :param is_virtual: Whether the column is virtual, i.e. excluded from persisting
            """
            if isinstance(data_type, DataType):
                self._j_builder.columnByMetadata(column_name,
                                                 _to_java_data_type(data_type),
                                                 metadata_key, is_virtual)
            else:
                self._j_builder.columnByMetadata(column_name, data_type,
                                                 metadata_key, is_virtual)
            return self
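And a hedged sketch of declaring a virtual metadata column next to a physical one, e.g. a record timestamp; the 'timestamp' metadata key assumes a connector (such as Kafka) that exposes it.

from pyflink.table import DataTypes, Schema

schema = Schema.new_builder() \
    .column('id', DataTypes.BIGINT()) \
    .column_by_metadata('ts', DataTypes.TIMESTAMP_LTZ(3), 'timestamp', True) \
    .build()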