from typing import Callable, List

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F

# KafkaStruct, InputStruct, get_confluent_config, and map_avro_to_spark_schema
# are project-local helpers; their imports are omitted here.


def kafka_wrapper(
    kafka: KafkaStruct,
    process: Callable[[List[DataFrame]], DataFrame],
    inputs: List[InputStruct],
    spark: SparkSession,
) -> DataFrame:
    """
    Read data from Kafka and apply a processing function.

    ...

    Attributes
    ----------
    kafka: Kafka parameters
    process: function to apply to the dataframes
    inputs: input parameters for each topic
    spark: the instantiated SparkSession
    """
    confluent_config = get_confluent_config(kafka.brokers, prefix="kafka.")
    dfs = [
        spark.readStream.format("kafka")
        .option("startingOffsets", "earliest")
        .option("failOnDataLoss", "false")
        .option("subscribe", input.topic)
        .options(**confluent_config)
        .option(
            "kafka.sasl.jaas.config",
            "org.apache.kafka.common.security.plain.PlainLoginModule required "
            "username='{}' password='{}';".format(
                kafka.confluent_api_key, kafka.confluent_secret
            ),
        )
        .load()
        .selectExpr("CAST(value AS STRING) as json")
        .select(
            F.from_json(
                F.col("json"),
                schema=map_avro_to_spark_schema(input.topic_schema),
            ).alias("data")
        )
        .select("data.*")
        for input in inputs
    ]
    return process(dfs).withColumn("topic_timestamp", F.current_timestamp())
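# Usage sketch (an illustration, not part of the module): a minimal `process`
# callable that unions the per-topic dataframes by column name. The names
# `union_all`, `kafka_struct`, and `input_structs` are hypothetical; the real
# process function is application-specific.
def union_all(dfs: List[DataFrame]) -> DataFrame:
    combined = dfs[0]
    for df in dfs[1:]:
        combined = combined.unionByName(df)
    return combined

# result = kafka_wrapper(kafka_struct, union_all, input_structs, spark)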
def test_get_confluent_config_only_broker():
    brokers = "brokers"
    expected = {
        "bootstrap.servers": "brokers",
        "security.protocol": "SASL_SSL",
        "sasl.mechanism": "PLAIN",
        "ssl.endpoint.identification.algorithm": "https",
    }
    assert get_confluent_config(brokers) == expected
def test_get_confluent_config_broker_and_prefix():
    brokers = "brokers"
    prefix = "kafka."
    expected = {
        "kafka.bootstrap.servers": "brokers",
        "kafka.security.protocol": "SASL_SSL",
        "kafka.sasl.mechanism": "PLAIN",
        "kafka.ssl.endpoint.identification.algorithm": "https",
    }
    assert get_confluent_config(brokers, prefix=prefix) == expected
import json
import sys

from confluent_kafka.admin import AdminClient


def main():
    _, app_name, kafka_struct, firestore_version = sys.argv
    kafka_info = KafkaStruct(**json.loads(kafka_struct))
    admin_client = AdminClient(
        get_confluent_config(
            kafka_info.brokers,
            api_key=kafka_info.confluent_api_key,
            secret=kafka_info.confluent_secret,
        )
    )
    create_topics(admin_client, app_name, firestore_version)
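# A sketch of what `create_topics` might look like (an assumption; the real
# project helper is defined elsewhere). It uses the confluent_kafka admin API:
# AdminClient.create_topics() returns a dict of topic name -> future, and
# future.result() raises if creation failed. The output-topic name mirrors
# get_kafka_output_topic_from_app_name as used in write_kafka below; the
# partition and replication counts are illustrative.
from confluent_kafka.admin import NewTopic

def create_topics(admin_client: AdminClient, app_name: str, version: str) -> None:
    topic = get_kafka_output_topic_from_app_name(app_name, version)
    futures = admin_client.create_topics(
        [NewTopic(topic, num_partitions=3, replication_factor=3)]
    )
    for name, future in futures.items():
        future.result()  # returns None on success, raises on failure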
def test_get_confluent_config_broker_and_key_secret():
    brokers = "brokers"
    api_key = "api"
    secret = "secret"
    expected = {
        "bootstrap.servers": "brokers",
        "security.protocol": "SASL_SSL",
        "sasl.mechanism": "PLAIN",
        "ssl.endpoint.identification.algorithm": "https",
        "sasl.username": "api",
        "sasl.password": "secret",
    }
    assert get_confluent_config(brokers, api_key=api_key, secret=secret) == expected
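# A sketch of get_confluent_config consistent with the three tests above (the
# actual implementation lives elsewhere in the project): a base Confluent
# Cloud SASL_SSL config, optional SASL credentials, and an optional key
# prefix such as "kafka." for Spark's Kafka source options.
from typing import Dict, Optional

def get_confluent_config(
    brokers: str,
    api_key: Optional[str] = None,
    secret: Optional[str] = None,
    prefix: str = "",
) -> Dict[str, str]:
    config = {
        "bootstrap.servers": brokers,
        "security.protocol": "SASL_SSL",
        "sasl.mechanism": "PLAIN",
        "ssl.endpoint.identification.algorithm": "https",
    }
    if api_key is not None and secret is not None:
        config["sasl.username"] = api_key
        config["sasl.password"] = secret
    return {prefix + key: value for key, value in config.items()}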
def write_kafka(batch_df: DataFrame, kafka: KafkaStruct, app_name: str, version: str):
    """
    Write a Spark streaming micro-batch back to Kafka.

    ...

    Attributes
    ----------
    batch_df: dataframe to write
    kafka: Kafka settings, including the API key and secret
    app_name: name of the streaming application
    version: code version as defined in FirestoreOutputStruct
    """
    confluent_config = get_confluent_config(kafka.brokers, prefix="kafka.")
    batch_df.select(
        F.to_json(F.struct(*batch_df.columns)).alias("value")
    ).write.format("kafka").options(**confluent_config).option(
        "kafka.sasl.jaas.config",
        "org.apache.kafka.common.security.plain.PlainLoginModule required "
        "username='{}' password='{}';".format(
            kafka.confluent_api_key, kafka.confluent_secret
        ),
    ).option(
        "topic", get_kafka_output_topic_from_app_name(app_name, version)
    ).save()
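# Usage sketch: write_kafka consumes a micro-batch dataframe, which suggests
# wiring it into a streaming query via foreachBatch. This wrapper and the
# name `start_output_stream` are hypothetical illustrations, not part of the
# project code.
def start_output_stream(
    stream_df: DataFrame, kafka: KafkaStruct, app_name: str, version: str
):
    return (
        stream_df.writeStream
        .foreachBatch(
            lambda df, _epoch_id: write_kafka(df, kafka, app_name, version)
        )
        .start()
    )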