def main(spark: pyspark.sql.SparkSession):
    """Run the LCR streaming pipeline: consume Kafka events, parse JSON, echo to console.

    Reads the ``lcr-events`` topic from the earliest offset, parses each
    message value against ``LCR_RESULT_SCHEMA``, flattens the parsed struct,
    and streams the rows to the console sink. Blocks until a streaming query
    terminates; the session is always stopped on the way out.
    """
    try:
        raw_events = (
            spark.readStream.format('kafka')
            .option('kafka.bootstrap.servers', 'localhost:9092')
            .option('subscribe', 'lcr-events')
            .option("startingOffsets", "earliest")
            .load()
        )
        # Kafka delivers the payload as binary; cast to string before JSON parsing,
        # then promote the parsed struct's fields to top-level columns.
        parsed_events = raw_events.select(
            from_json(col('value').cast("string"), schema=LCR_RESULT_SCHEMA).alias('data')
        ).select('data.*')
        parsed_events.writeStream.format('console').start()
        # Block the driver until any active streaming query ends (or fails).
        spark.streams.awaitAnyTermination()
    finally:
        # Ensure the session is released even if the stream raises.
        spark.stop()
def test_deduplicate_no_keys(spark_session: pyspark.sql.SparkSession):
    """With an empty key list, only fully-identical rows are collapsed."""
    source_rows = [
        (1, "Account_1", 30.5),
        (1, "Account_1", 30.6),
        (1, "Account_2", 30.6),
        (1, "Account_1", 30.5),  # exact duplicate of the first row
    ]
    input_df = spark_session.createDataFrame(source_rows, ['id', 'account', 'score'])
    result = Deduplicator().deduplicate([], input_df).collect()
    assert result == [
        (1, "Account_1", 30.5),
        (1, "Account_1", 30.6),
        (1, "Account_2", 30.6),
    ]
def test_simple(spark_session: pyspark.sql.SparkSession):
    """Listed digit characters are stripped and the column is renamed."""
    source_df = spark_session.createDataFrame(
        [(1, 'qwerty123'), (2, 'asdfgh123')], ['id', 'string']
    )
    remover = IllegalCharRemover(['1', '2', '3'], '')
    cleaned_df = remover.remove_illegal_chars(source_df, 'string', 'string_filtered')
    # The source column should be replaced by the filtered one.
    assert [f.name for f in cleaned_df.schema.fields] == ['id', 'string_filtered']
    assert cleaned_df.collect() == [(1, 'qwerty'), (2, 'asdfgh')]
def test_remove_special(spark_session: pyspark.sql.SparkSession):
    """Regex-significant characters are removed literally, not as patterns."""
    source_df = spark_session.createDataFrame(
        [(1, 'qwerty{\\/[]}^'), (2, 'asdfgh')], ['id', 'string']
    )
    remover = IllegalCharRemover(['^', '\\', '/', '[', ']', '{', '}'], '')
    cleaned_df = remover.remove_illegal_chars(source_df, 'string', 'string_filtered')
    # The source column should be replaced by the filtered one.
    assert [f.name for f in cleaned_df.schema.fields] == ['id', 'string_filtered']
    assert cleaned_df.collect() == [(1, 'qwerty'), (2, 'asdfgh')]
def iris_spark(
    iris: pd.DataFrame, spark_session: pyspark.sql.SparkSession
) -> pyspark.sql.DataFrame:
    """Materialize the pandas iris fixture as a Spark DataFrame."""
    return spark_session.createDataFrame(iris)