def test_group_status(spark: SparkSession, df_group_status: DataFrame) -> None:
    """Exercise per-group flag aggregation: a group's flag is True only when
    every row in the group satisfies the corresponding date condition.

    :param spark: active SparkSession (kept for fixture parity; not used directly)
    :param df_group_status: input frame; assumes columns ``grp`` and ``dt`` — TODO confirm
    :return: None (results are printed via ``show``)
    """
    from pyspark.sql import functions as F

    df_group_status.show()
    df_group_status.printSchema()

    # Row-level flags. when/otherwise (rather than the bare comparison) maps a
    # null `dt` to False instead of null, which keeps the collected sets clean.
    df_enrich: DataFrame = (
        df_group_status
        .withColumn(
            "cond1",
            F.when(
                F.col("dt") >= F.to_date(F.lit('2020-01-01'), 'yyyy-MM-dd'),
                F.lit(True),
            ).otherwise(F.lit(False)),
        )
        .withColumn(
            "cond2",
            F.when(
                F.col("dt") >= F.to_date(F.lit('2021-01-01'), 'yyyy-MM-dd'),
                F.lit(True),
            ).otherwise(F.lit(False)),
        )
    )
    df_enrich.show()
    df_enrich.printSchema()

    # Distinct flag values per group; a group passes a condition only if
    # False never appears in its set.
    df_enrich_further: DataFrame = (
        df_enrich.groupBy("grp")
        .agg(F.collect_set("cond1"), F.collect_set("cond2"))
        .toDF("grp", "cond1_set", "cond2_set")
    )
    df_enrich_further.show()
    df_enrich_further.printSchema()

    df_final: DataFrame = (
        df_enrich_further
        .withColumn("from_cond1_set", ~F.array_contains(F.col("cond1_set"), False))
        .withColumn("from_cond2_set", ~F.array_contains(F.col("cond2_set"), False))
    )
    df_final.show()
    df_final.printSchema()

    # Drop the intermediate columns and join the group-level verdicts back
    # onto the (flag-free) row-level frame.
    df_final = df_final.drop("cond1_set", "cond2_set")
    df_enrich = df_enrich.drop("cond1", "cond2")
    df_enrich.join(df_final, df_enrich["grp"] == df_final["grp"], "inner").show()
def print_data_info(data: DataFrame, file_name: str = '', isDetailed: bool = False) -> None:
    """Print a banner describing a Spark DataFrame: schema, sample rows, row count.

    :param data: Spark DataFrame to describe
    :param file_name: label shown in the banner (e.g. the source file name)
    :param isDetailed: when True, also print the column dtypes
    :return: None
    """
    print('----------------------------------------')
    print(f'\r| Data {file_name} info:')
    print('\r| Schema')
    data.printSchema()
    if isDetailed:
        print('\r| Types')
        print(data.dtypes)
    print('\r| First rows')
    data.show(n=10)
    # NOTE: count() triggers a full scan of the DataFrame.
    print(f'\r| Row count: {data.count()}')
    print('----------------------------------------')
    print('\n')