def for_schema(schema: 'CsvSchema') -> 'CsvReaderFormat':
    """
    Create a :class:`CsvReaderFormat` from the given `CsvSchema`.

    The Java ``CsvReaderFormat`` constructor is looked up reflectively and made
    accessible before it is invoked with a fresh ``CsvMapper``, the Java schema
    wrapped by ``schema``, the ``JsonNode`` class, a row converter, the matching
    ``InternalTypeInfo`` and ``False`` as the final boolean argument.

    :param schema: the CSV schema whose ``_j_schema`` and ``_data_type`` are used.
    :return: a :class:`CsvReaderFormat` wrapping the created Java object.
    """
    jvm = get_gateway().jvm
    jackson = jvm.org.apache.flink.shaded.jackson2.com.fasterxml.jackson

    # Locate the constructor by its exact parameter signature.
    j_reader_format_class = get_java_class(jvm.org.apache.flink.formats.csv.CsvReaderFormat)
    j_parameter_types = to_jarray(jvm.Class, [
        get_java_class(jackson.dataformat.csv.CsvMapper),
        get_java_class(jackson.dataformat.csv.CsvSchema),
        get_java_class(jvm.Class),
        get_java_class(jvm.org.apache.flink.formats.common.Converter),
        get_java_class(jvm.org.apache.flink.api.common.typeinfo.TypeInformation),
        get_java_class(jvm.boolean),
    ])
    constructor = j_reader_format_class.getDeclaredConstructor(j_parameter_types)
    constructor.setAccessible(True)

    # Build the constructor arguments in declaration order.
    j_mapper = jackson.dataformat.csv.CsvMapper()
    j_schema = schema._j_schema
    j_root_type = get_java_class(jackson.databind.JsonNode)
    j_converter = jvm.org.apache.flink.formats.csv.CsvToRowDataConverters(False) \
        .createRowConverter(_to_java_data_type(schema._data_type).getLogicalType(), True)
    j_type_info = jvm.org.apache.flink.table.runtime.typeutils.InternalTypeInfo.of(
        _to_java_data_type(schema._data_type).getLogicalType())
    j_arguments = to_jarray(
        jvm.Object,
        [j_mapper, j_schema, j_root_type, j_converter, j_type_info, False])

    return CsvReaderFormat(constructor.newInstance(j_arguments))
def __init__(self, field_names, field_types, path, field_delimiter=',', num_files=-1,
             write_mode=None):
    """
    Create the Java ``CsvTableSink`` and hand it to the parent constructor.

    :param field_names: names of the sink fields.
    :param field_types: data types of the sink fields.
    :param path: output path passed to the Java sink.
    :param field_delimiter: delimiter placed between fields.
    :param num_files: number of files passed to the Java sink.
    :param write_mode: ``WriteMode.NO_OVERWRITE``, ``WriteMode.OVERWRITE`` or ``None``.
    :raises Exception: if ``write_mode`` is none of the supported values.
    """
    gateway = get_gateway()

    # Translate the Python write mode into its Java counterpart.
    if write_mode is None:
        j_write_mode = None
    elif write_mode == WriteMode.NO_OVERWRITE:
        j_write_mode = gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.NO_OVERWRITE
    elif write_mode == WriteMode.OVERWRITE:
        j_write_mode = gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE
    else:
        raise Exception('Unsupported write_mode: %s' % write_mode)

    j_names = java_utils.to_jarray(gateway.jvm.String, field_names)
    j_types = java_utils.to_jarray(
        gateway.jvm.DataType,
        [_to_java_data_type(data_type) for data_type in field_types])

    j_sink = gateway.jvm.CsvTableSink(
        path, field_delimiter, num_files, j_write_mode, j_names, j_types)
    super(CsvTableSink, self).__init__(j_sink)
def call(f: Union[str, UserDefinedFunctionWrapper], *args) -> Expression:
    """
    Call a function given either by its path or as a Python user-defined function.

    If `f` is a str, the function is looked up in a catalog. Two kinds of
    functions exist:

        - System functions, identified by a one-part name
        - Catalog functions, always identified by a three-part name
          (catalog, database, function)

    Each function is either temporary or permanent (stored in an external
    catalog). Given those two properties, a `function_name` is resolved in this
    order:

        - Temporary system function
        - System function
        - Temporary catalog function
        - Catalog function

    :param f: the path of the function or the Python user-defined function.
    :param args: parameters of the user-defined function.
    """
    gateway = get_gateway()

    def to_j_arguments():
        # Convert the positional arguments into a Java Object[] of expressions.
        return to_jarray(gateway.jvm.Object, [_get_java_expression(arg) for arg in args])

    if isinstance(f, str):
        return Expression(gateway.jvm.Expressions.call(f, to_j_arguments()))

    def get_function_definition(f):
        if not isinstance(f, UserDefinedTableFunctionWrapper):
            return f._java_user_defined_function()

        # TypeInference was not supported for TableFunction in the old planner.
        # Use TableFunctionDefinition to work around this issue.
        j_result_types = to_jarray(
            gateway.jvm.TypeInformation,
            [_to_java_type(result_type) for result_type in f._result_types])
        j_result_type = gateway.jvm.org.apache.flink.api.java.typeutils.RowTypeInfo(
            j_result_types)
        return gateway.jvm.org.apache.flink.table.functions.TableFunctionDefinition(
            'f', f._java_user_defined_function(), j_result_type)

    # Expressions.apiCall is looked up reflectively and made accessible first.
    expressions_clz = load_java_class("org.apache.flink.table.api.Expressions")
    function_definition_clz = load_java_class(
        'org.apache.flink.table.functions.FunctionDefinition')
    j_object_array_type = to_jarray(gateway.jvm.Object, []).getClass()
    j_parameter_types = to_jarray(
        gateway.jvm.Class, [function_definition_clz, j_object_array_type])
    api_call_method = expressions_clz.getDeclaredMethod("apiCall", j_parameter_types)
    api_call_method.setAccessible(True)

    j_invocation_arguments = to_jarray(
        gateway.jvm.Object, [get_function_definition(f), to_j_arguments()])
    return Expression(api_call_method.invoke(None, j_invocation_arguments))
def __init__(self, j_table_sink, field_names, field_types):
    """
    Configure the given Java table sink with field names and types.

    :param j_table_sink: Java table sink exposing ``configure``.
    :param field_names: names of the sink fields.
    :param field_types: types of the sink fields, converted via ``_to_java_type``.
    """
    jvm = get_gateway().jvm
    j_names = java_utils.to_jarray(jvm.String, field_names)
    j_types = java_utils.to_jarray(
        jvm.TypeInformation,
        [_to_java_type(type_info) for type_info in field_types])
    # The parent receives the sink returned by configure().
    j_configured_sink = j_table_sink.configure(j_names, j_types)
    super(TestTableSink, self).__init__(j_configured_sink)
def _get_kafka_source_configuration(source: KafkaSource):
    """
    Read the configuration of a :class:`KafkaSource` via Java reflection.

    The no-argument ``getConfiguration`` method declared on the Java source's
    class is made accessible and invoked on the source.

    :param source: the Kafka source to inspect.
    :return: a :class:`Configuration` wrapping the returned Java configuration.
    """
    jvm = get_gateway().jvm
    j_source = source.get_java_function()

    no_parameter_types = to_jarray(jvm.java.lang.Class, [])
    j_getter = j_source.getClass().getDeclaredMethod('getConfiguration', no_parameter_types)
    j_getter.setAccessible(True)

    no_arguments = to_jarray(jvm.java.lang.Object, [])
    return Configuration(j_configuration=j_getter.invoke(j_source, no_arguments))
def __init__(self, field_names, field_types):
    """
    Create the Java testing sink for the given fields and pass it to the parent.

    :param field_names: names of the sink fields.
    :param field_types: data types of the sink fields.
    """
    TestTableSink._ensure_initialized()

    jvm = get_gateway().jvm
    j_names = java_utils.to_jarray(jvm.String, field_names)
    j_types = java_utils.to_jarray(
        jvm.DataType,
        [_to_java_data_type(data_type) for data_type in field_types])

    # NOTE(review): the super() call names TestRetractSink but the Java object is a
    # TestAppendingSink - confirm this is intentional.
    j_sink = jvm.org.apache.flink.table.utils.TestingSinks.TestAppendingSink(j_names, j_types)
    super(TestRetractSink, self).__init__(j_sink)
def from_fields(self,
                field_names: List[str],
                field_data_types: List[DataType]) -> 'Schema.Builder':
    """
    Adopts the given field names and field data types as physical columns of the schema.

    :param field_names: names of the columns.
    :param field_data_types: data types of the columns, one per name.
    :return: this builder.
    """
    jvm = get_gateway().jvm
    j_names = to_jarray(jvm.String, field_names)
    j_data_types = to_jarray(
        jvm.AbstractDataType,
        [_to_java_data_type(data_type) for data_type in field_data_types])
    self._j_builder.fromFields(j_names, j_data_types)
    return self
def __init__(self, field_names: List[str] = None, data_types: List[DataType] = None,
             j_table_schema=None):
    """
    Wrap an existing Java ``TableSchema`` or build one from names and types.

    :param field_names: field names, used only when ``j_table_schema`` is None.
    :param data_types: field data types, used only when ``j_table_schema`` is None.
    :param j_table_schema: an existing Java table schema to wrap as-is.
    """
    if j_table_schema is not None:
        self._j_table_schema = j_table_schema
        return

    jvm = get_gateway().jvm
    j_names = to_jarray(jvm.String, field_names)
    j_types = to_jarray(
        jvm.TypeInformation,
        [_to_java_type(data_type) for data_type in data_types])
    self._j_table_schema = jvm.TableSchema(j_names, j_types)
def _create_judf(self, serialized_func, j_input_types, j_function_kind):
    """
    Create the Java (table) aggregate function for this Python function.

    :param serialized_func: the serialized Python function.
    :param j_input_types: Java input types, or None; when not None they are
                          recomputed from ``self._input_types``.
    :param j_function_kind: the Java ``PythonFunctionKind``.
    :return: the Java aggregate function object.
    """
    if self._func_type == "pandas":
        from pyflink.table.types import DataTypes
        # For pandas functions the accumulator type is an array of the result type.
        self._accumulator_type = DataTypes.ARRAY(self._result_type)

    if j_input_types is not None:
        jvm = get_gateway().jvm
        j_input_types = java_utils.to_jarray(
            jvm.DataType,
            [_to_java_data_type(input_type) for input_type in self._input_types])

    j_result_type = _to_java_data_type(self._result_type)
    j_accumulator_type = _to_java_data_type(self._accumulator_type)

    j_python_functions = get_gateway().jvm.org.apache.flink.table.functions.python
    if self._is_table_aggregate:
        j_function_class = j_python_functions.PythonTableAggregateFunction
    else:
        j_function_class = j_python_functions.PythonAggregateFunction

    return j_function_class(
        self._name,
        bytearray(serialized_func),
        j_input_types,
        j_result_type,
        j_accumulator_type,
        j_function_kind,
        self._deterministic,
        self._takes_row_as_input,
        _get_python_env())
def _java_user_defined_function(self):
    """
    Return the Java user-defined function, creating and caching it on first use.

    :return: the Java function stored in ``self._judf_placeholder``.
    :raises TypeError: if ``self._func_type`` is neither "general" nor "pandas".
    """
    if self._judf_placeholder is not None:
        return self._judf_placeholder

    gateway = get_gateway()

    if self._input_types is None:
        j_input_types = None
    else:
        j_input_types = java_utils.to_jarray(
            gateway.jvm.DataType,
            [_to_java_data_type(input_type) for input_type in self._input_types])

    # Map the Python function type onto the Java PythonFunctionKind.
    JPythonFunctionKind = \
        gateway.jvm.org.apache.flink.table.functions.python.PythonFunctionKind
    if self._func_type == "general":
        j_function_kind = JPythonFunctionKind.GENERAL
    elif self._func_type == "pandas":
        j_function_kind = JPythonFunctionKind.PANDAS
    else:
        raise TypeError("Unsupported func_type: %s." % self._func_type)

    # Anything that is not a UserDefinedFunction is wrapped in a delegate function.
    if isinstance(self._func, UserDefinedFunction):
        func = self._func
    else:
        func = self._create_delegate_function()

    import cloudpickle
    serialized_func = cloudpickle.dumps(func)
    self._judf_placeholder = self._create_judf(
        serialized_func, j_input_types, j_function_kind)
    return self._judf_placeholder
def for_bulk_file_format(bulk_format: BulkFormat, *paths: str) -> FileSourceBuilder:
    """
    Start building a FileSource that reads the given paths with a bulk format.

    :param bulk_format: the bulk format whose ``_j_bulk_format`` is passed to Java.
    :param paths: the paths to read; each is converted to a Flink ``Path``.
    :return: a :class:`FileSourceBuilder` wrapping the Java builder.
    """
    j_path_class = get_gateway().jvm.org.apache.flink.core.fs.Path
    j_file_source_class = get_gateway().jvm.org.apache.flink.connector.file.src.FileSource
    j_path_array = to_jarray(j_path_class, [j_path_class(path) for path in paths])
    j_builder = j_file_source_class.forBulkFileFormat(
        bulk_format._j_bulk_format, j_path_array)
    return FileSourceBuilder(j_builder)
def or_(predicate0: Union[bool, Expression[bool]],
        predicate1: Union[bool, Expression[bool]],
        *predicates: Union[bool, Expression[bool]]) -> Expression[bool]:
    """
    Boolean OR in three-valued logic.

    :param predicate0: the first predicate.
    :param predicate1: the second predicate.
    :param predicates: any further predicates, passed on as a Java Object array.
    """
    jvm = get_gateway().jvm
    j_remaining = to_jarray(
        jvm.Object, [_get_java_expression(predicate) for predicate in predicates])
    return _ternary_op("or", predicate0, predicate1, j_remaining)
def partitioned_by(self, *partition_keys: str) -> 'TableDescriptor.Builder':
    """
    Define which columns this table is partitioned by.

    :param partition_keys: names of the partition columns.
    :return: this builder.
    """
    j_string_class = get_gateway().jvm.java.lang.String
    j_partition_keys = to_jarray(j_string_class, partition_keys)
    self._j_builder.partitionedBy(j_partition_keys)
    return self
def concat(first: Union[str, Expression[str]],
           *others: Union[str, Expression[str]]) -> Expression[str]:
    """
    Returns the string that results from concatenating the arguments.
    Returns NULL if any argument is NULL.

    :param first: the first string or string expression.
    :param others: further strings or string expressions to append.
    """
    jvm = get_gateway().jvm
    j_others = to_jarray(jvm.Object, [_get_java_expression(item) for item in others])
    return _binary_op("concat", first, j_others)
def for_row_type(row_type: RowType,
                 writer_properties: Optional[Configuration] = None,
                 hadoop_config: Optional[Configuration] = None) \
        -> BulkWriterFactory:
    """
    Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into Orc
    files in a batch fashion.

    Example:
    ::

        >>> row_type = DataTypes.ROW([
        ...     DataTypes.FIELD('string', DataTypes.STRING()),
        ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
        ... ])
        >>> row_type_info = Types.ROW_NAMED(
        ...     ['string', 'int_array'],
        ...     [Types.STRING(), Types.LIST(Types.INT())]
        ... )
        >>> sink = FileSink.for_bulk_format(
        ...     OUTPUT_DIR, OrcBulkWriters.for_row_type(
        ...         row_type=row_type,
        ...         writer_properties=Configuration(),
        ...         hadoop_config=Configuration(),
        ...     )
        ... ).build()
        >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

    Note that in the above example, an identity map to indicate its RowTypeInfo is necessary
    before ``sink_to`` when ``ds`` is a source stream producing **RowData** records,
    because RowDataBulkWriterFactory assumes the input record type is Row.

    :param row_type: the row type of the records to write.
    :param writer_properties: writer properties; an empty Configuration if None.
    :param hadoop_config: Hadoop configuration; an empty Configuration if None.
    :raises TypeError: if ``row_type`` is not a RowType.
    """
    if not isinstance(row_type, RowType):
        raise TypeError('row_type must be an instance of RowType')

    j_data_type = _to_java_data_type(row_type)
    jvm = get_gateway().jvm
    j_row_type = j_data_type.getLogicalType()

    # Field types and the ORC type description are both derived from the logical row type.
    j_field_types = to_jarray(
        jvm.org.apache.flink.table.types.logical.LogicalType,
        list(j_row_type.getChildren()))
    j_orc_type = jvm.org.apache.flink.orc.OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)

    if writer_properties is None:
        writer_properties = Configuration()
    if hadoop_config is None:
        hadoop_config = Configuration()

    j_writer_factory = jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
        jvm.org.apache.flink.orc.vector.RowDataVectorizer(
            j_orc_type.toString(), j_field_types),
        create_java_properties(writer_properties),
        create_hadoop_configuration(hadoop_config))
    return RowDataBulkWriterFactory(j_writer_factory, row_type)
def set_topics(self, *topics: str) -> 'KafkaSourceBuilder':
    """
    Set the topics the KafkaSource should consume from. Every topic in the list should
    already exist in the Kafka cluster, otherwise an exception will be thrown.

    To allow some topics to be created lazily, please use :meth:`set_topic_pattern` instead.

    :param topics: the list of topics to consume from.
    :return: this KafkaSourceBuilder.
    """
    j_string_class = get_gateway().jvm.java.lang.String
    j_topics = to_jarray(j_string_class, topics)
    self._j_builder.setTopics(j_topics)
    return self
def array(head, *tail) -> Expression:
    """
    Creates an array of literals.

    Example:
    ::

        >>> tab.select(array(1, 2, 3))

    :param head: the first element.
    :param tail: the remaining elements.
    """
    jvm = get_gateway().jvm
    j_tail = to_jarray(jvm.Object, [_get_java_expression(item) for item in tail])
    return _binary_op("array", head, j_tail)
def row(head, *tail) -> Expression:
    """
    Creates a row of expressions.

    Example:
    ::

        >>> tab.select(row("key1", 1))

    :param head: the first field.
    :param tail: the remaining fields.
    """
    jvm = get_gateway().jvm
    j_tail = to_jarray(jvm.Object, [_get_java_expression(item) for item in tail])
    return _binary_op("row", head, j_tail)
def sink(sql: str, type_info: RowTypeInfo, jdbc_connection_options: 'JdbcConnectionOptions',
         jdbc_execution_options: 'JdbcExecutionOptions' = None):
    """
    Create a JDBC sink.

    :param sql: arbitrary DML query (e.g. insert, update, upsert)
    :param type_info: A RowTypeInfo for query field types.
    :param jdbc_connection_options: parameters of connection, such as JDBC URL.
    :param jdbc_execution_options: parameters of execution, such as batch size and maximum
                                   retries. ``JdbcExecutionOptions.defaults()`` is used
                                   when None.
    :return: A JdbcSink.
    """
    gateway = get_gateway()

    # Map every field type to its SQL type and collect them in a Java int[].
    JJdbcTypeUtil = gateway.jvm.org.apache.flink.connector.jdbc.utils.JdbcTypeUtil
    sql_types = [
        JJdbcTypeUtil.typeInformationToSqlType(field_type.get_java_type_info())
        for field_type in type_info.get_field_types()
    ]
    j_sql_type = to_jarray(gateway.jvm.int, sql_types)

    # The statement builder factory method is looked up reflectively and made accessible.
    output_format_clz = gateway.jvm.Class.forName(
        'org.apache.flink.connector.jdbc.internal.JdbcBatchingOutputFormat',
        False,
        get_gateway().jvm.Thread.currentThread().getContextClassLoader())
    j_int_array_type = to_jarray(gateway.jvm.int, []).getClass()
    j_parameter_types = to_jarray(gateway.jvm.Class, [j_int_array_type])
    j_builder_method = output_format_clz.getDeclaredMethod(
        'createRowJdbcStatementBuilder', j_parameter_types)
    j_builder_method.setAccessible(True)
    j_statement_builder = j_builder_method.invoke(
        None, to_jarray(gateway.jvm.Object, [j_sql_type]))

    if jdbc_execution_options is None:
        jdbc_execution_options = JdbcExecutionOptions.defaults()

    j_jdbc_sink = gateway.jvm.org.apache.flink.connector.jdbc.JdbcSink.sink(
        sql,
        j_statement_builder,
        jdbc_execution_options._j_jdbc_execution_options,
        jdbc_connection_options._j_jdbc_connection_options)
    return JdbcSink(j_jdbc_sink=j_jdbc_sink)
def set_hosts(self, hosts: Union[str, List[str]]) -> 'ElasticsearchSinkBuilderBase':
    """
    Sets the hosts where the Elasticsearch cluster nodes are reachable.

    :param hosts: a list of hosts; any non-list value is treated as a single host.
    :return: this builder.
    """
    host_list = hosts if isinstance(hosts, list) else [hosts]
    JHttpHost = self.get_http_host_class()
    j_hosts = to_jarray(JHttpHost, [JHttpHost.create(host) for host in host_list])
    self._j_elasticsearch_sink_builder.setHosts(j_hosts)
    return self
def alias(self, name: str, *extra_names: str) -> 'Expression[T]':
    """
    Specifies a name for an expression i.e. a field.

    Example:
    ::

        >>> tab.select(col('a').alias('b'))

    :param name: name for one field.
    :param extra_names: additional names if the expression expands to multiple fields
    """
    j_string_class = get_gateway().jvm.String
    as_op = _ternary_op("as")
    return as_op(self, name, to_jarray(j_string_class, extra_names))
def get_function_definition(f):
    # Returns the Java function definition for the user-defined function wrapper `f`.
    # Relies on `gateway` being available in the enclosing scope.
    if not isinstance(f, UserDefinedTableFunctionWrapper):
        return f._java_user_defined_function()

    # TypeInference was not supported for TableFunction in the old planner.
    # Use TableFunctionDefinition to work around this issue.
    j_field_types = to_jarray(
        gateway.jvm.TypeInformation,
        [_to_java_type(result_type) for result_type in f._result_types])
    j_row_type = gateway.jvm.org.apache.flink.api.java.typeutils.RowTypeInfo(j_field_types)
    return gateway.jvm.org.apache.flink.table.functions.TableFunctionDefinition(
        'f', f._java_user_defined_function(), j_row_type)
def invoke_java_object_method(obj, method_name):
    """
    Invoke a no-argument method of a Java object, searching its class hierarchy.

    Starting at ``obj.getClass()``, each class is asked for a declared method
    called ``method_name``; when the lookup fails the search continues with the
    superclass. The first method found is made accessible and invoked on ``obj``
    with no arguments.

    :param obj: the Java object to invoke the method on.
    :param method_name: name of the no-argument method to invoke.
    :return: whatever the invoked Java method returns.
    :raises Exception: if no class in the hierarchy declares such a method.
    """
    clz = obj.getClass()
    j_method = None

    while clz is not None:
        try:
            j_method = clz.getDeclaredMethod(method_name, None)
        except Exception:
            # Only ordinary errors mean "not declared here"; KeyboardInterrupt and
            # SystemExit must propagate instead of being swallowed.
            j_method = None
        if j_method is not None:
            break
        # Always advance, even if the lookup returned None without raising,
        # so the loop cannot spin forever on the same class.
        clz = clz.getSuperclass()

    if j_method is None:
        raise Exception("No such method: " + method_name)
    j_method.setAccessible(True)
    return j_method.invoke(obj, to_jarray(get_gateway().jvm.Object, []))
def primary_key(self, *column_names: str) -> 'Schema.Builder':
    """
    Declares a primary key constraint for a set of given columns. A primary key uniquely
    identifies a row in a table. None of the columns in a primary key can be nullable. The
    primary key is informational only. It will not be enforced. It can be used for
    optimizations. It is the data owner's responsibility to ensure uniqueness of the data.

    The primary key will be assigned a generated name in the format {@code PK_col1_col2}.

    :param column_names: Columns that form a unique primary key
    :return: this builder.
    """
    j_string_class = get_gateway().jvm.java.lang.String
    j_column_names = to_jarray(j_string_class, column_names)
    self._j_builder.primaryKey(j_column_names)
    return self
def add_jars_to_context_class_loader(jar_urls):
    """
    Add jars to Python gateway server for local compilation and local execution (i.e.
    minicluster). Many Flink components are not on the classpath by default, e.g. the
    Kafka connector, the JDBC connector, the CSV format etc. This utility function can
    be used to hot load the jars.

    :param jar_urls: The list of jar urls.
    """
    gateway = get_gateway()
    # validate and normalize
    jar_urls = [gateway.jvm.java.net.URL(url) for url in jar_urls]
    context_classloader = gateway.jvm.Thread.currentThread().getContextClassLoader()
    class_loader_name = context_classloader.getClass().getName()

    if class_loader_name == "java.net.URLClassLoader":
        known_urls = {url.toString() for url in context_classloader.getURLs()}
        requested_urls = [url.toString() for url in jar_urls]
        if all(requested in known_urls for requested in requested_urls):
            # if urls all existed, no need to create new class loader.
            return

    # URLClassLoader.addURL is looked up reflectively and made accessible.
    URLClassLoaderClass = load_java_class("java.net.URLClassLoader")
    j_parameter_types = to_jarray(gateway.jvm.Class, [load_java_class("java.net.URL")])
    addURL = URLClassLoaderClass.getDeclaredMethod("addURL", j_parameter_types)
    addURL.setAccessible(True)

    loader = context_classloader
    if class_loader_name == "org.apache.flink.runtime.execution.librarycache." \
            "FlinkUserCodeClassLoaders$SafetyNetWrapperClassLoader":
        # For the safety-net wrapper the jars are added to the loader from ensureInner().
        ensureInner = context_classloader.getClass().getDeclaredMethod("ensureInner", None)
        ensureInner.setAccessible(True)
        loader = ensureInner.invoke(context_classloader, None)

    for jar_url in jar_urls:
        addURL.invoke(loader, to_jarray(get_gateway().jvm.Object, [jar_url]))
def with_columns(head, *tails) -> Expression:
    """
    Creates an expression that selects a range of columns. It can be used wherever an array of
    expression is accepted such as function calls, projections, or groupings.

    A range can either be index-based or name-based. Indices start at 1 and boundaries are
    inclusive.

    e.g. with_columns(range_("b", "c")) or with_columns(col("*"))

    .. seealso:: :func:`~pyflink.table.expressions.range_`,
                 :func:`~pyflink.table.expressions.without_columns`
    """
    jvm = get_gateway().jvm
    j_tails = to_jarray(jvm.Object, [_get_java_expression(item) for item in tails])
    return _binary_op("withColumns", head, j_tails)
def _create_judf(self, serialized_func, j_input_types, j_function_kind):
    """
    Create the Java ``PythonTableFunction`` for this Python table function.

    :param serialized_func: the serialized Python function.
    :param j_input_types: Java input types, passed through unchanged.
    :param j_function_kind: the Java ``PythonFunctionKind``.
    :return: the Java table function object.
    """
    jvm = get_gateway().jvm

    # The result type is a ROW built from the individual result types.
    j_field_types = java_utils.to_jarray(
        jvm.DataType,
        [_to_java_data_type(result_type) for result_type in self._result_types])
    j_result_type = jvm.DataTypes.ROW(j_field_types)

    j_function_class = jvm.org.apache.flink.table.functions.python.PythonTableFunction
    return j_function_class(
        self._name,
        bytearray(serialized_func),
        j_input_types,
        j_result_type,
        j_function_kind,
        self._deterministic,
        self._takes_row_as_input,
        _get_python_env())
def primary_key_named(self, constraint_name: str, *column_names: str) -> 'Schema.Builder':
    """
    Declares a primary key constraint for a set of given columns. A primary key uniquely
    identifies a row in a table. None of the columns in a primary key can be nullable. The
    primary key is informational only. It will not be enforced. It can be used for
    optimizations. It is the data owner's responsibility to ensure uniqueness of the data.

    :param constraint_name: Name for the primary key, can be used to reference the constraint
    :param column_names: Columns that form a unique primary key
    :return: this builder.
    """
    j_string_class = get_gateway().jvm.java.lang.String
    j_column_names = to_jarray(j_string_class, column_names)
    self._j_builder.primaryKeyNamed(constraint_name, j_column_names)
    return self
def concat_ws(separator: Union[str, Expression[str]],
              first: Union[str, Expression[str]],
              *others: Union[str, Expression[str]]) -> Expression[str]:
    """
    Returns the string that results from concatenating the arguments and separator.
    Returns NULL If the separator is NULL.

    .. note::

        this function does not skip empty strings. However, it does skip any NULL
        values after the separator argument.

    :param separator: the separator placed between the arguments.
    :param first: the first string or string expression.
    :param others: further strings or string expressions.
    """
    jvm = get_gateway().jvm
    j_others = to_jarray(jvm.Object, [_get_java_expression(item) for item in others])
    return _ternary_op("concatWs", separator, first, j_others)
def for_record_stream_format(stream_format: StreamFormat, *paths: str) -> FileSourceBuilder:
    """
    Builds a new FileSource using a :class:`~FileSource.StreamFormat` to read record-by-record
    from a file stream.

    When possible, stream-based formats are generally easier (preferable) to file-based
    formats, because they support better default behavior around I/O batching or progress
    tracking (checkpoints).

    Stream formats also automatically de-compress files based on the file extension. This
    supports files ending in ".deflate" (Deflate), ".xz" (XZ), ".bz2" (BZip2), ".gz", ".gzip"
    (GZip).

    :param stream_format: the stream format whose ``_j_stream_format`` is passed to Java.
    :param paths: the paths to read; each is converted to a Flink ``Path``.
    :return: a :class:`FileSourceBuilder` wrapping the Java builder.
    """
    j_path_class = get_gateway().jvm.org.apache.flink.core.fs.Path
    j_file_source_class = get_gateway().jvm.org.apache.flink.connector.file.src.FileSource
    j_path_array = to_jarray(j_path_class, [j_path_class(path) for path in paths])
    j_builder = j_file_source_class.forRecordStreamFormat(
        stream_format._j_stream_format, j_path_array)
    return FileSourceBuilder(j_builder)