def consume_raw(
    kafka_config: Dict,
    cdc_spec: CDCSpec,
    partitions: List[int] = None,
    table_type: TableType = TableType.stream(),
) -> Table:
    """Consume the raw events from a Change Data Capture (CDC) Kafka stream to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table.
            Passed to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any KafkaConsumer
            specific desired configuration here. Note this should include the relevant property for a schema
            server URL where the necessary Avro key and/or value schemas are stored.
        cdc_spec (CDCSpec): a CDCSpec obtained from calling either the cdc_long_spec or the cdc_short_spec
            function
        partitions (List[int]): a list of integer partition numbers, default is None indicating all partitions
        table_type (TableType): a TableType enum, default is TableType.stream()

    Returns:
        a Deephaven live table for the raw CDC events

    Raises:
        DHError
    """
    try:
        partitions = j_partitions(partitions)
        kafka_config = j_properties(kafka_config)
        table_type_enum = table_type.value
        return Table(
            j_table=_JCdcTools.consumeRawToTable(kafka_config, cdc_spec.j_object, partitions, table_type_enum)
        )
    except Exception as e:
        raise DHError(e, "failed to consume a raw CDC stream.") from e
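

# Illustrative sketch only: a typical consume_raw() call. The broker and schema registry addresses, the
# server/database/table names, and the cdc_short_spec() signature shown here are assumptions (that function
# is referenced by the docstring above but defined elsewhere); adjust them for your own deployment.
def _example_consume_raw() -> Table:
    cdc_spec = cdc_short_spec("mysql-server", "inventory", "orders")  # hypothetical CDC source
    return consume_raw(
        {
            "bootstrap.servers": "localhost:9092",           # hypothetical broker
            "schema.registry.url": "http://localhost:8081",  # hypothetical schema registry
        },
        cdc_spec,
    )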


def produce(
    table: Table,
    kafka_config: Dict,
    topic: str,
    key_spec: KeyValueSpec,
    value_spec: KeyValueSpec,
    last_by_key_columns: bool = False,
) -> Callable[[], None]:
    """Produce to Kafka from a Deephaven table.

    Args:
        table (Table): the source table to publish to Kafka
        kafka_config (Dict): configuration for the associated Kafka producer
        topic (str): the topic name
        key_spec (KeyValueSpec): specifies how to map table column(s) to the Key field in produced Kafka
            messages. This should be the result of calling one of the functions simple_spec(), avro_spec() or
            json_spec() in this module, or the constant KeyValueSpec.IGNORE
        value_spec (KeyValueSpec): specifies how to map table column(s) to the Value field in produced Kafka
            messages. This should be the result of calling one of the functions simple_spec(), avro_spec() or
            json_spec() in this module, or the constant KeyValueSpec.IGNORE
        last_by_key_columns (bool): whether to publish only the last record for each unique key. Ignored if
            key_spec is KeyValueSpec.IGNORE. Otherwise, if last_by_key_columns is True, this method will
            internally perform a last_by aggregation on table grouped by the input columns of key_spec and
            publish to Kafka from the result.

    Returns:
        a callback that, when invoked, stops publishing and cleans up subscriptions and resources. Users
        should hold on to this callback to ensure liveness for publishing for as long as publishing is
        desired, and invoke it once publishing is no longer needed.

    Raises:
        DHError
    """
    try:
        if key_spec is KeyValueSpec.IGNORE and value_spec is KeyValueSpec.IGNORE:
            raise ValueError(
                "at least one argument for 'key_spec' or 'value_spec' must be different from KeyValueSpec.IGNORE"
            )

        kafka_config = j_properties(kafka_config)
        runnable = _JKafkaTools.produceFromTable(
            table.j_table,
            kafka_config,
            topic,
            key_spec.j_object,
            value_spec.j_object,
            last_by_key_columns,
        )

        def cleanup():
            try:
                runnable.run()
            except Exception as ex:
                raise DHError(ex, "failed to stop publishing to Kafka and clean up resources.") from ex

        return cleanup
    except Exception as e:
        raise DHError(e, "failed to start producing Kafka messages.") from e
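

# Illustrative sketch only: publishes a table to Kafka using the producer avro_spec() defined later in this
# section for the Value field and KeyValueSpec.IGNORE for the Key field. The broker address, topic, schema
# name, and source table are hypothetical placeholders.
def _example_produce(orders: Table) -> Callable[[], None]:
    cancel_callback = produce(
        orders,
        {
            "bootstrap.servers": "localhost:9092",           # hypothetical broker
            "schema.registry.url": "http://localhost:8081",  # needed by avro_spec to fetch the schema
        },
        "orders_topic",                                      # hypothetical topic
        KeyValueSpec.IGNORE,                                 # no Key field in the produced messages
        avro_spec("orders_value_schema"),                    # hypothetical registered schema name
        last_by_key_columns=False,
    )
    # Hold on to the returned callback; invoke it to stop publishing and release resources.
    return cancel_callback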


def consume(
    kafka_config: Dict,
    cdc_spec: CDCSpec,
    partitions: List[int] = None,
    stream_table: bool = False,
    cols_to_drop: List[str] = None,
) -> Table:
    """Consume from a Change Data Capture (CDC) Kafka stream (as produced, e.g., by Debezium), tracking the
    underlying database table to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table.
            Passed to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any KafkaConsumer
            specific desired configuration here. Note this should include the relevant property for a schema
            server URL where the necessary Avro key and/or value schemas are stored.
        cdc_spec (CDCSpec): a CDCSpec obtained from calling either the cdc_long_spec or the cdc_short_spec
            function
        partitions (List[int]): a list of integer partition numbers, default is None indicating all partitions
        stream_table (bool): if True, produce a streaming table of changed rows keeping the CDC 'op' column
            indicating the type of column change; if False, return a Deephaven ticking table that tracks the
            underlying database table through the CDC stream.
        cols_to_drop (List[str]): a list of column names to omit from the resulting DHC table. Note that only
            columns not included in the primary key for the table can be dropped at this stage; chain a drop
            column operation after this call if you need to drop primary key columns.

    Returns:
        a Deephaven live table that will update based on the CDC messages consumed for the given topic

    Raises:
        DHError
    """
    try:
        partitions = j_partitions(partitions)
        kafka_config = j_properties(kafka_config)
        return Table(
            j_table=_JCdcTools.consumeToTable(kafka_config, cdc_spec.j_object, partitions, stream_table, cols_to_drop)
        )
    except Exception as e:
        raise DHError(e, "failed to consume a CDC stream.") from e
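

# Illustrative sketch only: uses the CDC consume() defined directly above (in the actual library this lives
# in a separate module from the Kafka consume() below) to get a streaming table of changed rows, keeping the
# CDC 'op' column. The connection details and cdc_short_spec() arguments are hypothetical.
def _example_cdc_consume() -> Table:
    cdc_spec = cdc_short_spec("mysql-server", "inventory", "customers")  # hypothetical CDC source
    return consume(
        {
            "bootstrap.servers": "localhost:9092",           # hypothetical broker
            "schema.registry.url": "http://localhost:8081",  # hypothetical schema registry
        },
        cdc_spec,
        stream_table=True,  # keep one row per change event, with the 'op' column
    )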


def consume(
    kafka_config: Dict,
    topic: str,
    partitions: List[int] = None,
    offsets: Dict[int, int] = None,
    key_spec: KeyValueSpec = None,
    value_spec: KeyValueSpec = None,
    table_type: TableType = TableType.stream(),
) -> Table:
    """Consume from Kafka to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table.
            Once the table-specific properties are stripped, the remaining properties are used to call the
            constructor of org.apache.kafka.clients.consumer.KafkaConsumer; pass any KafkaConsumer specific
            desired configuration here
        topic (str): the Kafka topic name
        partitions (List[int]): a list of integer partition numbers, default is None which means all partitions
        offsets (Dict[int, int]): a mapping between partition numbers and offset numbers; can be one of the
            predefined ALL_PARTITIONS_SEEK_TO_BEGINNING, ALL_PARTITIONS_SEEK_TO_END or ALL_PARTITIONS_DONT_SEEK.
            The default is None, which works the same as ALL_PARTITIONS_DONT_SEEK. The offset numbers may be
            one of the predefined SEEK_TO_BEGINNING, SEEK_TO_END, or DONT_SEEK.
        key_spec (KeyValueSpec): specifies how to map the Key field in Kafka messages to Deephaven column(s).
            It can be the result of calling one of the functions simple_spec(), avro_spec() or json_spec() in
            this module, or the predefined KeyValueSpec.IGNORE or KeyValueSpec.FROM_PROPERTIES. The default is
            None, which works the same as KeyValueSpec.FROM_PROPERTIES, in which case the kafka_config param
            should include values for dictionary keys 'deephaven.key.column.name' and
            'deephaven.key.column.type', for the single resulting column name and type
        value_spec (KeyValueSpec): specifies how to map the Value field in Kafka messages to Deephaven
            column(s). It can be the result of calling one of the functions simple_spec(), avro_spec() or
            json_spec() in this module, or the predefined KeyValueSpec.IGNORE or KeyValueSpec.FROM_PROPERTIES.
            The default is None, which works the same as KeyValueSpec.FROM_PROPERTIES, in which case the
            kafka_config param should include values for dictionary keys 'deephaven.value.column.name' and
            'deephaven.value.column.type', for the single resulting column name and type
        table_type (TableType): a TableType enum, default is TableType.stream()

    Returns:
        a Deephaven live table that will update based on Kafka messages consumed for the given topic

    Raises:
        DHError
    """
    try:
        partitions = j_partitions(partitions)

        if offsets is None or offsets == ALL_PARTITIONS_DONT_SEEK:
            offsets = _ALL_PARTITIONS_DONT_SEEK
        elif offsets == ALL_PARTITIONS_SEEK_TO_BEGINNING:
            offsets = _ALL_PARTITIONS_SEEK_TO_BEGINNING
        elif offsets == ALL_PARTITIONS_SEEK_TO_END:
            offsets = _ALL_PARTITIONS_SEEK_TO_END
        else:
            partitions_array = jpy.array("int", list(offsets.keys()))
            offsets_array = jpy.array("long", list(offsets.values()))
            offsets = _JKafkaTools.partitionToOffsetFromParallelArrays(partitions_array, offsets_array)

        key_spec = KeyValueSpec.FROM_PROPERTIES if key_spec is None else key_spec
        value_spec = KeyValueSpec.FROM_PROPERTIES if value_spec is None else value_spec

        if key_spec is KeyValueSpec.IGNORE and value_spec is KeyValueSpec.IGNORE:
            raise ValueError(
                "at least one argument for 'key_spec' or 'value_spec' must be different from KeyValueSpec.IGNORE"
            )

        kafka_config = j_properties(kafka_config)
        return Table(
            j_table=_JKafkaTools.consumeToTable(
                kafka_config,
                topic,
                partitions,
                offsets,
                key_spec.j_object,
                value_spec.j_object,
                table_type.j_object,
            )
        )
    except Exception as e:
        raise DHError(e, "failed to consume a Kafka stream.") from e
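

# Illustrative sketch only: consumes a topic with key_spec and value_spec left as None (i.e.
# KeyValueSpec.FROM_PROPERTIES), so the single key and value columns are described directly in kafka_config
# as documented above. The broker address, topic, column names, and type strings are hypothetical placeholders.
def _example_kafka_consume() -> Table:
    return consume(
        {
            "bootstrap.servers": "localhost:9092",   # hypothetical broker
            "deephaven.key.column.name": "Symbol",   # single Key column (FROM_PROPERTIES)
            "deephaven.key.column.type": "String",
            "deephaven.value.column.name": "Price",  # single Value column (FROM_PROPERTIES)
            "deephaven.value.column.type": "double",
        },
        "quotes_topic",                              # hypothetical topic
        offsets=ALL_PARTITIONS_SEEK_TO_BEGINNING,    # replay the topic from the start
        table_type=TableType.stream(),
    )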


def avro_spec(
    schema: str,
    schema_version: str = "latest",
    field_to_col_mapping: Dict[str, str] = None,
    timestamp_field: str = None,
    include_only_columns: List[str] = None,
    exclude_columns: List[str] = None,
    publish_schema: bool = False,
    schema_namespace: str = None,
    column_properties: Dict[str, str] = None,
) -> KeyValueSpec:
    """Creates a spec for how to use an Avro schema to produce a Kafka stream from a Deephaven table.

    Args:
        schema (str): the name for a schema registered in a Confluent compatible Schema Server. The associated
            'kafka_config' parameter in the call to produce() should include the key 'schema.registry.url'
            with the value of the Schema Server URL for fetching the schema definition
        schema_version (str): the schema version to fetch from the schema service, default is 'latest'
        field_to_col_mapping (Dict[str, str]): a mapping from Avro field names in the schema to column names
            in the Deephaven table. Any fields in the schema not present in the dict as keys are mapped to
            columns of the same name. The default is None, meaning all schema fields are mapped to columns of
            the same name.
        timestamp_field (str): the name of an extra timestamp field to be included in the produced Kafka
            message body; it is used mostly for debugging slowdowns, default is None.
        include_only_columns (List[str]): the list of column names in the source table to include in the
            generated output, default is None. When not None, the 'exclude_columns' parameter must be None
        exclude_columns (List[str]): the list of column names to exclude from the generated output (every
            other column will be included), default is None. When not None, the 'include_only_columns'
            parameter must be None
        publish_schema (bool): when True, publish the given schema name to the Schema Registry Server,
            according to an Avro schema generated from the table definition, for the columns and fields
            implied by field_to_col_mapping, include_only_columns, and exclude_columns; if a schema_version is
            provided and the resulting version after publishing does not match, an exception results. The
            default is False.
        schema_namespace (str): when 'publish_schema' is True, the namespace for the generated schema to be
            registered in the Schema Registry Server.
        column_properties (Dict[str, str]): when 'publish_schema' is True, specifies the properties of the
            columns implying particular Avro type mappings for them. In particular, a column X of BigDecimal
            type should specify properties 'x.precision' and 'x.scale'.

    Returns:
        a KeyValueSpec

    Raises:
        DHError
    """
    try:
        field_to_col_mapping = j_hashmap(field_to_col_mapping)
        column_properties = j_properties(column_properties)
        include_only_columns = j_hashset(include_only_columns)
        include_only_columns = _JKafkaTools.predicateFromSet(include_only_columns)
        exclude_columns = j_hashset(exclude_columns)
        exclude_columns = _JKafkaTools.predicateFromSet(exclude_columns)
        return KeyValueSpec(
            _JKafkaTools_Produce.avroSpec(
                schema,
                schema_version,
                field_to_col_mapping,
                timestamp_field,
                include_only_columns,
                exclude_columns,
                publish_schema,
                schema_namespace,
                column_properties,
            )
        )
    except Exception as e:
        raise DHError(e, "failed to create a Kafka key/value spec.") from e
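

# Illustrative sketch only: builds a producer KeyValueSpec from a registered Avro schema, remapping one Avro
# field to a differently named table column and excluding one column from the output. The schema name and the
# field/column names are hypothetical placeholders for a real Schema Registry entry.
def _example_avro_value_spec() -> KeyValueSpec:
    return avro_spec(
        "orders_value_schema",                            # hypothetical registered schema name
        schema_version="latest",
        field_to_col_mapping={"order_ts": "Timestamp"},   # Avro field -> Deephaven column
        exclude_columns=["InternalNotes"],                # hypothetical column to leave out of the output
    )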