示例#1
0
def test_group_status(spark: SparkSession, df_group_status: DataFrame) -> None:
    """
        Derive per-group "all rows pass" flags for two date cutoffs and show them.

        For each cutoff a per-row boolean is computed from ``dt``; the distinct
        booleans are collected per ``grp``; a group passes a cutoff only when
        ``False`` is absent from its collected set. The group-level flags are
        then joined back onto the input rows. Every intermediate frame is
        printed together with its schema; nothing is asserted.

        :param spark: Spark session; not referenced in the body
                      (presumably requested as a test fixture - confirm)
        :param df_group_status: frame providing at least the columns ``grp`` and ``dt``
        :return: none
    """
    # All Spark SQL functions are reached through F so the body does not rely
    # on bare names (when/col/lit/to_date) being imported elsewhere in the file.
    from pyspark.sql import functions as F

    def reached(cutoff: str):
        """Build a boolean column: True when ``dt`` is on or after ``cutoff``."""
        on_or_after = F.col("dt") >= F.to_date(F.lit(cutoff), 'yyyy-MM-dd')
        # when/otherwise (not the bare comparison) so a null ``dt`` becomes
        # False rather than null and therefore still counts against the group.
        return F.when(on_or_after, F.lit(True)).otherwise(F.lit(False))

    df_group_status.show()
    df_group_status.printSchema()

    # Step 1: row-level flags, one per cutoff.
    df_enrich: DataFrame = (
        df_group_status
        .withColumn("cond1", reached('2020-01-01'))
        .withColumn("cond2", reached('2021-01-01'))
    )
    df_enrich.show()
    df_enrich.printSchema()

    # Step 2: distinct flag values seen in each group.
    df_enrich_further: DataFrame = (
        df_enrich
        .groupBy("grp")
        .agg(F.collect_set("cond1"), F.collect_set("cond2"))
        .toDF("grp", "cond1_set", "cond2_set")
    )
    df_enrich_further.show()
    df_enrich_further.printSchema()

    # Step 3: a group passes a cutoff only if no row in it produced False.
    df_final: DataFrame = (
        df_enrich_further
        .withColumn("from_cond1_set", ~F.array_contains(F.col("cond1_set"), False))
        .withColumn("from_cond2_set", ~F.array_contains(F.col("cond2_set"), False))
    )
    df_final.show()
    df_final.printSchema()

    # Step 4: drop the helper columns on both sides and attach the group-level
    # flags back onto every input row.
    group_flags: DataFrame = df_final.drop("cond1_set", "cond2_set")
    input_rows: DataFrame = df_enrich.drop("cond1", "cond2")

    input_rows.join(group_flags, input_rows["grp"] == group_flags["grp"],
                    "inner").show()
def print_data_info(data: DataFrame,
                    file_name: str = '',
                    isDetailed: bool = False):
    """
        Print a framed description of a Spark data frame to stdout.

        The schema is always printed. When ``isDetailed`` is set, the column
        dtypes, the first 10 rows and the row count are printed as well.

        :param data: Spark data frame to describe
        :param file_name: label inserted into the header line
        :param isDetailed: also print dtypes, first rows and the row count
        :return: none
    """
    separator = '----------------------------------------'

    # Header: opening rule, title line, then the schema section.
    for header_line in (separator, f'\r| Data {file_name} info:', '\r| Schema'):
        print(header_line)
    data.printSchema()

    if isDetailed:
        print('\r| Types')
        print(data.dtypes)

        print('\r| First rows')
        data.show(n=10)

        row_total = data.count()
        print('\r| Row count: {}'.format(row_total))

    # Footer: closing rule followed by the extra blank lines.
    print(separator)
    print('\n')