Example #1
def consume_raw(
        kafka_config: dict,
        cdc_spec: CDCSpec,
        partitions: List[int] = None,
        table_type: TableType = TableType.stream(),
) -> Table:
    """ Consume the raw events from a Change Data Capture (CDC) Kafka stream to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table. Passed
            to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any desired KafkaConsumer-specific
            configuration here. Note this should include the relevant property for the schema server URL where the
            necessary Avro schemas for the key and/or value are stored.
        cdc_spec (CDCSpec): a CDCSpec obtained from calling either the cdc_long_spec or the cdc_short_spec function
        partitions (List[int]): a list of integer partition numbers; default is None, indicating all partitions
        table_type (TableType): a TableType enum, default is TableType.stream()

    Returns:
        a Deephaven live table for the raw CDC events

    Raises:
        DHError
    """
    try:
        partitions = j_partitions(partitions)
        kafka_config = j_properties(kafka_config)
        table_type_enum = table_type.value
        return Table(j_table=_JCdcTools.consumeRawToTable(
            kafka_config, cdc_spec.j_object, partitions, table_type_enum))
    except Exception as e:
        raise DHError(e, "failed to consume a raw CDC stream.") from e
Example #2
def produce(
    table: Table,
    kafka_config: Dict,
    topic: str,
    key_spec: KeyValueSpec,
    value_spec: KeyValueSpec,
    last_by_key_columns: bool = False,
) -> Callable[[], None]:
    """Produce to Kafka from a Deephaven table.

    Args:
        table (Table): the source table to publish to Kafka
        kafka_config (Dict): configuration for the associated kafka producer
        topic (str): the topic name
        key_spec (KeyValueSpec): specifies how to map table column(s) to the Key field in produced Kafka messages.
            This should be the result of calling one of the functions simple_spec(), avro_spec() or json_spec() in this
            module, or the constant KeyValueSpec.IGNORE
        value_spec (KeyValueSpec): specifies how to map table column(s) to the Value field in produced Kafka messages.
            This should be the result of calling one of the functions simple_spec(), avro_spec() or json_spec() in this module,
            or the constant KeyValueSpec.IGNORE
        last_by_key_columns (bool): whether to publish only the last record for each unique key. Ignored if key_spec
            is KeyValueSpec.IGNORE. Otherwise, if last_by_key_columns is True, this method will internally perform a
            last_by aggregation on the table, grouped by the input columns of key_spec, and publish to Kafka from the
            result.

    Returns:
        a callback that, when invoked, stops publishing and cleans up subscriptions and resources.
        Users should hold on to this callback for as long as publishing is desired, to keep it alive,
        and invoke it once publishing is no longer needed

    Raises:
        DHError
    """
    try:
        if key_spec is KeyValueSpec.IGNORE and value_spec is KeyValueSpec.IGNORE:
            raise ValueError(
                "at least one argument for 'key_spec' or 'value_spec' must be different from KeyValueSpec.IGNORE"
            )

        kafka_config = j_properties(kafka_config)
        runnable = _JKafkaTools.produceFromTable(
            table.j_table,
            kafka_config,
            topic,
            key_spec.j_object,
            value_spec.j_object,
            last_by_key_columns,
        )

        def cleanup():
            try:
                runnable.run()
            except Exception as ex:
                raise DHError(
                    ex, "failed to stop publishing to Kafka and the clean-up."
                ) from ex

        return cleanup
    except Exception as e:
        raise DHError(e, "failed to start producing Kafka messages.") from e
Example #3
def consume(
    kafka_config: Dict,
    cdc_spec: CDCSpec,
    partitions: List[int] = None,
    stream_table: bool = False,
    cols_to_drop: List[str] = None,
) -> Table:
    """ Consume from a Change Data Capture (CDC) Kafka stream (as, eg, produced by Debezium), tracking the underlying
    database table to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table. Passed
            to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any desired KafkaConsumer-specific
            configuration here. Note this should include the relevant property for the schema server URL where the
            necessary Avro schemas for the key and/or value are stored.
        cdc_spec (CDCSpec): a CDCSpec obtained from calling either the cdc_long_spec or the cdc_short_spec function
        partitions (List[int]): a list of integer partition numbers; default is None, indicating all partitions
        stream_table (bool): if True, produce a streaming table of changed rows, keeping the CDC 'op' column
            indicating the type of row change; if False, return a Deephaven ticking table that tracks the underlying
            database table through the CDC stream.
        cols_to_drop (List[str]): a list of column names to omit from the resulting Deephaven table. Note that only
            columns not included in the table's primary key can be dropped at this stage; chain a drop-columns
            operation after this call if you need to drop key columns.

    Returns:
        a Deephaven live table that will update based on the CDC messages consumed for the given topic

    Raises:
        DHError
    """
    try:
        partitions = j_partitions(partitions)
        kafka_config = j_properties(kafka_config)
        return Table(j_table=_JCdcTools.consumeToTable(
            kafka_config, cdc_spec.j_object, partitions, stream_table,
            cols_to_drop))
    except Exception as e:
        raise DHError(e, "failed to consume a CDC stream.") from e
Example #4
def consume(
        kafka_config: Dict,
        topic: str,
        partitions: List[int] = None,
        offsets: Dict[int, int] = None,
        key_spec: KeyValueSpec = None,
        value_spec: KeyValueSpec = None,
        table_type: TableType = TableType.stream(),
) -> Table:
    """Consume from Kafka to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table.
            Once the table-specific properties are stripped, the remaining ones are used to call the constructor of
            org.apache.kafka.clients.consumer.KafkaConsumer; pass any desired KafkaConsumer-specific configuration here
        topic (str): the Kafka topic name
        partitions (List[int]): a list of integer partition numbers; default is None, which means all partitions
        offsets (Dict[int, int]): a mapping from partition numbers to offset numbers, or one of the predefined
            constants ALL_PARTITIONS_SEEK_TO_BEGINNING, ALL_PARTITIONS_SEEK_TO_END, or ALL_PARTITIONS_DONT_SEEK.
            The default is None, which works the same as ALL_PARTITIONS_DONT_SEEK. The offset numbers may also be one
            of the predefined constants SEEK_TO_BEGINNING, SEEK_TO_END, or DONT_SEEK.
        key_spec (KeyValueSpec): specifies how to map the Key field in Kafka messages to Deephaven column(s).
            It can be the result of calling one of the functions simple_spec(), avro_spec() or json_spec() in this
            module, or the predefined KeyValueSpec.IGNORE or KeyValueSpec.FROM_PROPERTIES. The default is None, which
            works the same as KeyValueSpec.FROM_PROPERTIES; in that case, the kafka_config param should include values
            for the dictionary keys 'deephaven.key.column.name' and 'deephaven.key.column.type', for the single
            resulting column name and type
        value_spec (KeyValueSpec): specifies how to map the Value field in Kafka messages to Deephaven column(s).
            It can be the result of calling one of the functions simple_spec(), avro_spec() or json_spec() in this
            module, or the predefined KeyValueSpec.IGNORE or KeyValueSpec.FROM_PROPERTIES. The default is None, which
            works the same as KeyValueSpec.FROM_PROPERTIES; in that case, the kafka_config param should include values
            for the dictionary keys 'deephaven.value.column.name' and 'deephaven.value.column.type', for the single
            resulting column name and type
        table_type (TableType): a TableType enum, default is TableType.stream()

    Returns:
        a Deephaven live table that will update based on Kafka messages consumed for the given topic

    Raises:
        DHError
    """

    try:
        partitions = j_partitions(partitions)

        if offsets is None or offsets == ALL_PARTITIONS_DONT_SEEK:
            offsets = _ALL_PARTITIONS_DONT_SEEK
        elif offsets == ALL_PARTITIONS_SEEK_TO_BEGINNING:
            offsets = _ALL_PARTITIONS_SEEK_TO_BEGINNING
        elif offsets == ALL_PARTITIONS_SEEK_TO_END:
            offsets = _ALL_PARTITIONS_SEEK_TO_END
        else:
            partitions_array = jpy.array("int", list(offsets.keys()))
            offsets_array = jpy.array("long", list(offsets.values()))
            offsets = _JKafkaTools.partitionToOffsetFromParallelArrays(
                partitions_array, offsets_array)

        key_spec = KeyValueSpec.FROM_PROPERTIES if key_spec is None else key_spec
        value_spec = KeyValueSpec.FROM_PROPERTIES if value_spec is None else value_spec

        if key_spec is KeyValueSpec.IGNORE and value_spec is KeyValueSpec.IGNORE:
            raise ValueError(
                "at least one argument for 'key' or 'value' must be different from KeyValueSpec.IGNORE"
            )

        kafka_config = j_properties(kafka_config)
        return Table(j_table=_JKafkaTools.consumeToTable(
            kafka_config,
            topic,
            partitions,
            offsets,
            key_spec.j_object,
            value_spec.j_object,
            table_type.j_object,
        ))
    except Exception as e:
        raise DHError(e, "failed to consume a Kafka stream.") from e
Example #5
def avro_spec(
    schema: str,
    schema_version: str = "latest",
    field_to_col_mapping: Dict[str, str] = None,
    timestamp_field: str = None,
    include_only_columns: List[str] = None,
    exclude_columns: List[str] = None,
    publish_schema: bool = False,
    schema_namespace: str = None,
    column_properties: Dict[str, str] = None,
) -> KeyValueSpec:
    """Creates a spec for how to use an Avro schema to produce a Kafka stream from a Deephaven table.

    Args:
        schema (str): the name of a schema registered in a Confluent-compatible Schema Server. The associated
            'kafka_config' parameter in the call to produce() should include the key 'schema.registry.url' with
            the value of the Schema Server URL for fetching the schema definition
        schema_version (str): the schema version to fetch from the schema service; default is 'latest'
        field_to_col_mapping (Dict[str, str]): a mapping from Avro field names in the schema to column names in
            the Deephaven table. Any fields in the schema not present in the dict as keys are mapped to columns of the
            same name. The default is None, meaning all schema fields are mapped to columns of the same name.
        timestamp_field (str): the name of an extra timestamp field to include in the produced Kafka message body;
            used mostly for debugging slowdowns. Default is None.
        include_only_columns (List[str]): the list of column names in the source table to include in the generated
            output, default is None. When not None, the 'exclude_columns' parameter must be None
        exclude_columns (List[str]): the list of column names to exclude from the generated output (every other
            column will be included), default is None. When not None, the 'include_only_columns' parameter must be None
        publish_schema (bool): when True, publish the given schema name to the Schema Registry Server, according to an
            Avro schema generated from the table definition, for the columns and fields implied by field_to_col_mapping,
            include_only_columns, and exclude_columns; if a schema_version is provided and the resulting version after
            publishing does not match, an exception results. The default is False.
        schema_namespace (str): when 'publish_schema' is True, the namespace for the generated schema to be registered
            in the Schema Registry Server.
        column_properties (Dict[str, str]): when 'publish_schema' is True, specifies properties for columns that imply
            particular Avro type mappings for them. For example, a column X of BigDecimal type should specify the
            properties 'x.precision' and 'x.scale'.

    Returns:
        a KeyValueSpec

    Raises:
        DHError
    """
    try:
        field_to_col_mapping = j_hashmap(field_to_col_mapping)
        column_properties = j_properties(column_properties)
        include_only_columns = j_hashset(include_only_columns)
        include_only_columns = _JKafkaTools.predicateFromSet(
            include_only_columns)
        exclude_columns = j_hashset(exclude_columns)
        exclude_columns = _JKafkaTools.predicateFromSet(exclude_columns)

        return KeyValueSpec(
            _JKafkaTools_Produce.avroSpec(
                schema,
                schema_version,
                field_to_col_mapping,
                timestamp_field,
                include_only_columns,
                exclude_columns,
                publish_schema,
                schema_namespace,
                column_properties,
            ))
    except Exception as e:
        raise DHError(e, "failed to create a Kafka key/value spec.") from e