Exemplo n.º 1
0
    def from_source(self,
                    source: Source,
                    watermark_strategy: WatermarkStrategy,
                    source_name: str,
                    type_info: TypeInformation = None) -> 'DataStream':
        """
        Adds a data :class:`~pyflink.datastream.connectors.Source` to the environment to get a
        :class:`~pyflink.datastream.DataStream`.

        The result will be either a bounded data stream (that can be processed in a batch way) or
        an unbounded data stream (that must be processed in a streaming way), based on the
        boundedness property of the source.

        This method takes an explicit type information for the produced data stream, so that
        callers can define directly what type/serializer will be used for the produced stream. For
        sources that describe their produced type, the parameter type_info should not be specified
        to avoid specifying the produced type redundantly.

        :param source: The source; its Java function is handed to the Java environment.
        :param watermark_strategy: The watermark strategy whose Java counterpart is forwarded.
        :param source_name: The name forwarded to the Java ``fromSource`` call.
        :param type_info: Optional type information of the produced stream. When it is None,
                          None is forwarded as the Java type information.
        :return: A DataStream wrapping the Java data stream returned by ``fromSource``.

        .. versionadded:: 1.13.0
        """
        # Compare against None explicitly rather than relying on truthiness, so that a
        # type information object that happens to evaluate as falsy is still honoured.
        if type_info is not None:
            j_type_info = type_info.get_java_type_info()
        else:
            j_type_info = None
        j_data_stream = self._j_stream_execution_environment.fromSource(
            source.get_java_function(),
            watermark_strategy._j_watermark_strategy,
            source_name,
            j_type_info)
        return DataStream(j_data_stream=j_data_stream)
Exemplo n.º 2
0
    def key_by(self, key_selector: Union[Callable, KeySelector],
               key_type_info: TypeInformation = None) -> 'KeyedStream':
        """
        Partitions this stream by key and returns the resulting KeyedStream.

        Every element is first mapped to a ``(key, element)`` row by an intermediate map
        operator; the Java stream of that operator is then keyed with a PickledKeySelector.

        :param key_selector: A callable or KeySelector used to extract the key of an element.
                             A plain callable is wrapped in a KeySelectorFunctionWrapper.
        :param key_type_info: The type information describing the key type. When omitted,
                              ``Types.PICKLED_BYTE_ARRAY()`` is used.
        :return: The DataStream with partitioned state (i.e. KeyedStream).
        :raises TypeError: If key_selector is neither callable nor a KeySelector.
        """
        if callable(key_selector):
            key_selector = KeySelectorFunctionWrapper(key_selector)
        elif not isinstance(key_selector, (KeySelector, KeySelectorFunctionWrapper)):
            raise TypeError("Parameter key_selector should be a type of KeySelector.")

        gateway = get_gateway()
        PickledKeySelector = gateway.jvm \
            .org.apache.flink.datastream.runtime.functions.python.PickledKeySelector
        output_type_info = typeinfo._from_java_type(
            self._j_data_stream.getTransformation().getOutputType())

        # Without an explicit key type, keys are described as pickled byte arrays and the
        # Java key selector is told so.
        key_is_pickled = key_type_info is None
        if key_is_pickled:
            key_type_info = Types.PICKLED_BYTE_ARRAY()

        # Pair every element with its key so that the Java side can key by the first field.
        row_type_info = Types.ROW([key_type_info, output_type_info])
        keyed_rows = self.map(lambda value: (key_selector.get_key(value), value),
                              type_info=row_type_info)
        map_operator_name = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil \
            .STREAM_KEY_BY_MAP_OPERATOR_NAME
        keyed_rows.name(map_operator_name)

        j_keyed_stream = keyed_rows._j_data_stream.keyBy(
            PickledKeySelector(key_is_pickled),
            key_type_info.get_java_type_info())
        result = KeyedStream(j_keyed_stream, self)
        # Record the element type of the stream as it was before the (key, element) mapping.
        result._original_data_type_info = output_type_info
        return result
Exemplo n.º 3
0
def from_type_info(type_info: TypeInformation) -> FieldCoder:
    """
    Return the coder that corresponds to the given type information.

    Composite type infos (arrays, lists, maps, tuples, rows, external types) are resolved
    recursively from the coders of their nested type infos.

    :param type_info: The type information to look up a coder for.
    :return: The matching coder.
    :raises ValueError: If no coder is known for the given type information.
    """
    # The checks run in a fixed order and the first match wins.
    if isinstance(type_info, PickledBytesTypeInfo):
        return PickleCoder()
    if isinstance(type_info, BasicTypeInfo):
        return _basic_type_info_mappings[type_info._basic_type]
    if isinstance(type_info, DateTypeInfo):
        return DateCoder()
    if isinstance(type_info, TimeTypeInfo):
        return TimeCoder()
    if isinstance(type_info, TimestampTypeInfo):
        return TimestampCoder(3)

    if isinstance(type_info, PrimitiveArrayTypeInfo):
        component = type_info._element_type
        holds_bytes = (isinstance(component, BasicTypeInfo)
                       and component._basic_type == BasicType.BYTE)
        # A primitive array of bytes gets the binary coder instead of an array coder.
        if holds_bytes:
            return BinaryCoder()
        return PrimitiveArrayCoder(from_type_info(component))

    if isinstance(type_info, (BasicArrayTypeInfo, ObjectArrayTypeInfo)):
        return GenericArrayCoder(from_type_info(type_info._element_type))
    if isinstance(type_info, ListTypeInfo):
        return GenericArrayCoder(from_type_info(type_info.elem_type))

    if isinstance(type_info, MapTypeInfo):
        key_coder = from_type_info(type_info._key_type_info)
        value_coder = from_type_info(type_info._value_type_info)
        return MapCoder(key_coder, value_coder)

    if isinstance(type_info, TupleTypeInfo):
        return TupleCoder(
            [from_type_info(member) for member in type_info.get_field_types()])

    if isinstance(type_info, RowTypeInfo):
        field_coders = [from_type_info(member) for member in type_info.get_field_types()]
        field_names = list(type_info.get_field_names())
        return RowCoder(field_coders, field_names)

    if isinstance(type_info, ExternalTypeInfo):
        # Resolve the coder from the type info held by the external type info.
        return from_type_info(type_info._type_info)
    if isinstance(type_info, GenericRecordAvroTypeInfo):
        return AvroCoder(type_info._schema)

    raise ValueError("Unsupported type_info %s." % type_info)
Exemplo n.º 4
0
 def __init__(self, type_info: TypeInformation):
     """
     Create the Java CsvRowSerializationSchema builder for the given type information.

     :param type_info: The type information handed to the Java builder; must be a
                       WrapperTypeInfo.
     :raises TypeError: If type_info is None.
     :raises ValueError: If type_info is not a WrapperTypeInfo.
     """
     if type_info is None:
         raise TypeError("Type information must not be None")
     if not isinstance(type_info, WrapperTypeInfo):
         raise ValueError('type_info must be WrapperTypeInfo')

     jvm = get_gateway().jvm
     j_builder_class = jvm.org.apache.flink.formats.csv.CsvRowSerializationSchema.Builder
     self._j_builder = j_builder_class(type_info.get_java_type_info())
Exemplo n.º 5
0
    def from_collection(self, collection: List[Any],
                        type_info: TypeInformation = None) -> DataStream:
        """
        Builds a data stream out of the given non-empty collection. The element type of the
        collection determines the type of the data stream.

        Note that the resulting data stream source is non-parallel, i.e. it has a parallelism
        of one.

        :param collection: The collection of elements to create the data stream from.
        :param type_info: The TypeInformation for the produced data stream. When given, every
                          element is converted with ``type_info.to_internal_type`` first.
        :return: the data stream representing the given collection.
        """
        if type_info is None:
            return self._from_collection(collection, type_info)

        internal_elements = [type_info.to_internal_type(item) for item in collection]
        return self._from_collection(internal_elements, type_info)
Exemplo n.º 6
0
    def add_source(self, source_func: SourceFunction, source_name: str = 'Custom Source',
                   type_info: TypeInformation = None) -> 'DataStream':
        """
        Registers a user defined source function as a data source of the streaming topology.

        :param source_func: the user defined function.
        :param source_name: name of the data source. Optional.
        :param type_info: type of the returned stream. Optional; when omitted, None is
                          forwarded as the Java type information.
        :return: the data stream constructed.
        """
        if type_info is None:
            j_type_info = None
        else:
            j_type_info = type_info.get_java_type_info()

        j_stream = self._j_stream_execution_environment.addSource(
            source_func.get_java_function(),
            source_name,
            j_type_info)
        return DataStream(j_data_stream=j_stream)
Exemplo n.º 7
0
 def __init__(self, type_info: TypeInformation):
     """
     Create the Java CsvRowSerializationSchema builder for the given type information.

     :param type_info: The type information handed to the Java builder.
     :raises TypeError: If type_info is None.
     """
     if type_info is None:
         raise TypeError("Type information must not be None")

     jvm = get_gateway().jvm
     j_builder_class = jvm.org.apache.flink.formats.csv.CsvRowSerializationSchema.Builder
     self._j_builder = j_builder_class(type_info.get_java_type_info())