import json
import sys
from typing import Callable, List

import pyspark.sql.functions as F
from confluent_kafka.admin import AdminClient
from pyspark.sql import DataFrame, SparkSession

# Project-local helpers and structs (e.g. get_confluent_config, KafkaStruct)
# are assumed to be importable from the surrounding package.


def kafka_wrapper(
    kafka: KafkaStruct,
    process: Callable[[List[DataFrame]], DataFrame],
    inputs: List[InputStruct],
    spark: SparkSession,
) -> DataFrame:
    """
    Read streaming data from Kafka and apply a processing function.

    Parameters
    ----------
    kafka: Kafka connection parameters, including the Confluent API key and secret
    process: function that combines the per-topic dataframes into one dataframe
    inputs: per-topic input parameters (topic name and schema)
    spark: the instantiated SparkSession
    """
    confluent_config = get_confluent_config(kafka.brokers, prefix="kafka.")

    dfs = [
        spark.readStream.format("kafka")
        .option("startingOffsets", "earliest")
        .option("failOnDataLoss", "false")
        .option("subscribe", input_struct.topic)
        .options(**confluent_config)
        .option(
            "kafka.sasl.jaas.config",
            "org.apache.kafka.common.security.plain.PlainLoginModule required "
            "username='{}' password='{}';".format(kafka.confluent_api_key,
                                                  kafka.confluent_secret),
        )
        .load()
        # Kafka delivers the payload as bytes: decode it to a string and parse
        # the JSON value into columns using the topic's Avro-derived schema.
        .selectExpr("CAST(value AS STRING) as json")
        .select(
            F.from_json(
                F.col("json"),
                schema=map_avro_to_spark_schema(input_struct.topic_schema),
            ).alias("data"))
        .select("data.*")
        for input_struct in inputs
    ]
    return process(dfs).withColumn("topic_timestamp", F.current_timestamp())
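

# A minimal sketch of the parameter containers kafka_wrapper receives. Only
# the fields referenced above (brokers, confluent_api_key, confluent_secret,
# topic, topic_schema) are grounded in this code; the real definitions in the
# project may carry additional fields.
from dataclasses import dataclass
from typing import Any, Dict


@dataclass
class KafkaStruct:
    brokers: str
    confluent_api_key: str
    confluent_secret: str


@dataclass
class InputStruct:
    topic: str
    topic_schema: Dict[str, Any]  # Avro schema given to map_avro_to_spark_schema

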
def test_get_confluent_config_only_broker():
    brokers = "brokers"
    expected = {
        "bootstrap.servers": "brokers",
        "security.protocol": "SASL_SSL",
        "sasl.mechanism": "PLAIN",
        "ssl.endpoint.identification.algorithm": "https",
    }
    assert get_confluent_config(brokers) == expected


def test_get_confluent_config_broker_and_prefix():
    brokers = "brokers"
    prefix = "kafka."
    expected = {
        "kafka.bootstrap.servers": "brokers",
        "kafka.security.protocol": "SASL_SSL",
        "kafka.sasl.mechanism": "PLAIN",
        "kafka.ssl.endpoint.identification.algorithm": "https",
    }
    assert get_confluent_config(brokers, prefix=prefix) == expected


def main():
    """Create the Kafka output topics for a streaming application."""
    # Expected CLI arguments: <app_name> <kafka config as JSON> <firestore version>.
    _, app_name, kafka_struct, firestore_version = sys.argv
    kafka_info = KafkaStruct(**json.loads(kafka_struct))
    admin_client = AdminClient(
        get_confluent_config(
            kafka_info.brokers,
            api_key=kafka_info.confluent_api_key,
            secret=kafka_info.confluent_secret,
        ))
    create_topics(admin_client, app_name, firestore_version)
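

# A hedged sketch of create_topics as invoked by main() above. It assumes the
# output topic name comes from get_kafka_output_topic_from_app_name (as in
# write_kafka) and that confluent_kafka's AdminClient/NewTopic API is used;
# the partition and replication counts are illustrative, not the project's
# actual settings.
from confluent_kafka.admin import AdminClient, NewTopic


def create_topics(admin_client: AdminClient, app_name: str, version: str):
    topic = get_kafka_output_topic_from_app_name(app_name, version)
    futures = admin_client.create_topics(
        [NewTopic(topic, num_partitions=3, replication_factor=3)])
    for future in futures.values():
        # result() re-raises any broker-side error, e.g. topic already exists.
        future.result()

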
def test_get_confluent_config_broker_and_key_secret():
    brokers = "brokers"
    api_key = "api"
    secret = "secret"
    expected = {
        "bootstrap.servers": "brokers",
        "security.protocol": "SASL_SSL",
        "sasl.mechanism": "PLAIN",
        "ssl.endpoint.identification.algorithm": "https",
        "sasl.username": "******",
        "sasl.password": "******",
    }
    assert get_confluent_config(brokers, api_key=api_key,
                                secret=secret) == expected
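

# A sketch of get_confluent_config consistent with the three tests above: a
# fixed SASL_SSL/PLAIN base configuration, an optional key prefix (used to
# namespace the options as "kafka.*" for Spark), and optional SASL
# credentials. The real implementation may differ in detail.
from typing import Dict, Optional


def get_confluent_config(brokers: str,
                         prefix: str = "",
                         api_key: Optional[str] = None,
                         secret: Optional[str] = None) -> Dict[str, str]:
    config = {
        "bootstrap.servers": brokers,
        "security.protocol": "SASL_SSL",
        "sasl.mechanism": "PLAIN",
        "ssl.endpoint.identification.algorithm": "https",
    }
    if api_key is not None and secret is not None:
        config["sasl.username"] = api_key
        config["sasl.password"] = secret
    return {prefix + key: value for key, value in config.items()}

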
def write_kafka(batch_df: DataFrame, kafka: KafkaStruct, app_name: str,
                version: str):
    """
    Write a micro-batch of the streaming dataframe back to Kafka.

    Parameters
    ----------
    batch_df: dataframe (micro-batch) to write
    kafka: Kafka settings, including the Confluent API key and secret
    app_name: name of the streaming application
    version: code version as defined in FirestoreOutputStruct
    """
    confluent_config = get_confluent_config(kafka.brokers, prefix="kafka.")
    (batch_df
     # Kafka expects the payload in a single "value" column; serialize each
     # row as a JSON string.
     .select(F.to_json(F.struct(*batch_df.columns)).alias("value"))
     .write.format("kafka")
     .options(**confluent_config)
     .option(
         "kafka.sasl.jaas.config",
         "org.apache.kafka.common.security.plain.PlainLoginModule required "
         "username='{}' password='{}';".format(kafka.confluent_api_key,
                                               kafka.confluent_secret),
     )
     .option("topic", get_kafka_output_topic_from_app_name(app_name, version))
     .save())
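

# A hedged end-to-end sketch showing how kafka_wrapper and write_kafka might
# be wired together with foreachBatch: each micro-batch of the streaming
# dataframe is serialized and written back to Kafka. The union_all process
# function, the checkpoint path, and run_pipeline itself are illustrative
# assumptions, not part of the original code.
def run_pipeline(kafka: KafkaStruct, inputs: List[InputStruct],
                 spark: SparkSession, app_name: str, version: str):

    def union_all(dfs: List[DataFrame]) -> DataFrame:
        # Example process function: stack the per-topic dataframes by column
        # name (assumes the topics share a schema).
        result = dfs[0]
        for df in dfs[1:]:
            result = result.unionByName(df)
        return result

    streaming_df = kafka_wrapper(kafka, union_all, inputs, spark)
    query = (streaming_df.writeStream
             .foreachBatch(lambda batch_df, _epoch_id: write_kafka(
                 batch_df, kafka, app_name, version))
             .option("checkpointLocation", "/tmp/checkpoints/" + app_name)
             .start())
    query.awaitTermination()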