def get_df_main_datasets_never_read(df_dataset_file_rse_ts_size,
                                    filtered_rses_id_name_map, min_tb_limit):
    """Get never accessed datasets' dataframes"""
    reverted_filtered_rses_id_name_map = get_reverted_rses_id_name_map(
        filtered_rses_id_name_map)
    df_sub_datasets_never_read = df_dataset_file_rse_ts_size \
        .groupby(['rse_id', 'dataset']) \
        .agg(_round(_sum(col('f_size')) / TB_DENOMINATOR, 5).alias('dataset_size_in_rse_tb'),
             _max(col('accessed_at')).alias('last_access_time_of_dataset_in_rse'),
             _max(col('created_at')).alias('last_create_time_of_dataset_in_rse'),
             _count(lit(1)).alias('#_files_of_dataset_in_rse'),
             ) \
        .filter(col('last_access_time_of_dataset_in_rse').isNull()) \
        .filter(col('dataset_size_in_rse_tb') > min_tb_limit) \
        .replace(reverted_filtered_rses_id_name_map, subset=['rse_id']) \
        .withColumnRenamed('rse_id', 'RSE name') \
        .select(['RSE name',
                 'dataset',
                 'dataset_size_in_rse_tb',
                 'last_create_time_of_dataset_in_rse',
                 '#_files_of_dataset_in_rse'
                 ]) \
        .cache()

    df_main_datasets_never_read = df_sub_datasets_never_read \
        .groupby(['dataset']) \
        .agg(_max(col('dataset_size_in_rse_tb')).alias('max_dataset_size_in_rses(TB)'),
             _min(col('dataset_size_in_rse_tb')).alias('min_dataset_size_in_rses(TB)'),
             _avg(col('dataset_size_in_rse_tb')).alias('avg_dataset_size_in_rses(TB)'),
             _sum(col('dataset_size_in_rse_tb')).alias('sum_dataset_size_in_rses(TB)'),
             _max(col('last_create_time_of_dataset_in_rse')).alias('last_create_time_of_dataset_in_all_rses'),
             concat_ws(', ', collect_list('RSE name')).alias('RSE(s)'),
             ) \
        .cache()
    return df_main_datasets_never_read, df_sub_datasets_never_read
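
# A minimal sketch of what get_reverted_rses_id_name_map might do (an assumption, not the
# original helper): invert the RSE id/name mapping so that .replace(..., subset=['rse_id'])
# above can translate rse_id values into RSE names, e.g.
#   def get_reverted_rses_id_name_map(rses_id_name_map):
#       return {v: k for k, v in rses_id_name_map.items()}
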
def get_device_tracking_observations(df):
    """
    Calculates first_observed_at and last_observed_at for each device from the supplied observations.

    :param df: A DataFrame of network connection observations.
    :return: A DataFrame of (organization, mac, first_observed_at, last_observed_at)
    """

    if df and df.columns:
        outbound_df = df.select(
            'organization',
            col('network_src_mac').name('mac'),
            'occurred_at'
        )

        inbound_df = df.select(
            'organization',
            col('network_dest_mac').name('mac'),
            'occurred_at'
        )

        return outbound_df.union(
            inbound_df
        ).where(
            col('mac').isNotNull()
        ).groupBy(
            'organization',
            'mac'
        ).agg(
            _min('occurred_at').name('first_observed_at'),
            _max('occurred_at').name('last_observed_at'),
        )
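
# Hedged usage sketch for get_device_tracking_observations: build a tiny in-memory DataFrame
# with the columns the function expects (organization, network_src_mac, network_dest_mac,
# occurred_at). The session name and sample values below are illustrative assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('device-tracking-example').getOrCreate()
observations = spark.createDataFrame(
    [
        ('acme', 'aa:bb:cc:00:00:01', 'aa:bb:cc:00:00:02', '2021-01-01 10:00:00'),
        ('acme', 'aa:bb:cc:00:00:02', None, '2021-01-02 11:30:00'),
    ],
    ['organization', 'network_src_mac', 'network_dest_mac', 'occurred_at'],
)
get_device_tracking_observations(observations).show()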
Example #3
def get_prcp_day(df_in):
    # Group the values, aggregating selected fields per city and day
    print('BUILDING DATAFRAME WITH VALUES AGGREGATED BY DAY')
    df_out = df_in.groupby('city', 'yr', 'mo', 'da').agg(
        _abs(_max('lat')).alias('latitude'),
        _sum('prcp').alias('prcp_dia'),
        _max('tmax').alias('tmax'),
        _min('tmin').alias('tmin'),
        _mean('temp').alias('med_temp')).orderBy('city', 'yr', 'mo', 'da')
    return df_out
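
# Hedged usage sketch: df_weather is a hypothetical DataFrame with at least the columns
# city, yr, mo, da, lat, prcp, tmax, tmin and temp, as consumed by get_prcp_day above.
#   df_daily = get_prcp_day(df_weather)
#   df_daily.show(5)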
def get_df_main_not_read_since(df_sub_not_read_since):
    """Get dataframe of datasets not read since N months for main htmls.

    Get last access of dataframe in all RSE(s)
    """
    return df_sub_not_read_since \
        .groupby(['dataset']) \
        .agg(_max(col('dataset_size_in_rse_tb')).alias('max_dataset_size_in_rses(TB)'),
             _min(col('dataset_size_in_rse_tb')).alias('min_dataset_size_in_rses(TB)'),
             _avg(col('dataset_size_in_rse_tb')).alias('avg_dataset_size_in_rses(TB)'),
             _sum(col('dataset_size_in_rse_tb')).alias('sum_dataset_size_in_rses(TB)'),
             _max(col('last_access_time_of_dataset_in_rse')).alias('last_access_time_of_dataset_in_all_rses'),
             concat_ws(', ', collect_list('RSE name')).alias('RSE(s)'),
             ) \
        .cache()
Example #5
def get_crab_popularity_ds(start_date,
                           end_date,
                           verbose=False,
                           base=_BASE_PATH):
    """
    Query the hdfs data and returns a pandas dataframe with:
    Datatier, Dataset, CMSPrimaryPrimaryDataset, job_count, workflow_count, ChirpCMSSWReadBytes
    args:
        - start_date datetime Start of the query period (RecordTime)
        - end_date datetime End of the query period
    """
    start = int(start_date.timestamp() * 1000)
    end = int(end_date.timestamp() * 1000)
    spark = get_spark_session(yarn=True, verbose=verbose)

    dfs_crabdb = (
        spark.read.option("basePath", base)
        .json(
            _get_candidate_files(start_date, end_date, spark, base=base),
            schema=_get_crab_condor_schema(),
        )
        .select("metadata.timestamp", "data.*")
        .filter("""Status in ('Completed', 'Removed') AND
                   CRAB_DataBlock is not NULL AND
                   timestamp >= {} AND
                   timestamp <= {}""".format(start, end))
        .repartition("CRAB_DataBlock")
        .drop_duplicates(["GlobalJobId"])
        .withColumnRenamed("CMSPrimaryPrimaryDataset", "PrimaryDataset")
        .withColumn("Dataset",
                    regexp_extract("CRAB_DataBlock", "^(.*)/([^/]*)#.*$", 1))
        .withColumn("Datatier",
                    regexp_extract("CRAB_DataBlock", "^(.*)/([^/]*)#.*$", 2))
    )
    dfs_crabdb = (
        dfs_crabdb.groupBy("Datatier", "PrimaryDataset", "Dataset")
        .agg(
            _max(col("RecordTime")),
            _min(col("RecordTime")),
            count(lit(1)),
            countDistinct("CRAB_Workflow"),
            _sum(col("ChirpCMSSWReadBytes")),
        )
        .withColumnRenamed("count(1)", "job_count")
        .withColumnRenamed("count(DISTINCT CRAB_Workflow)", "workflow_count")
        .withColumnRenamed("sum(ChirpCMSSWReadBytes)", "ChirpCMSSWReadBytes")
        .na.fill("Unknown", ["Datatier", "PrimaryDataset", "Dataset"])
    )
    return dfs_crabdb.toPandas()
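
# Hedged usage sketch (the date range is illustrative; _BASE_PATH, get_spark_session,
# _get_candidate_files and _get_crab_condor_schema are helpers from the surrounding module):
#   from datetime import datetime
#   pdf = get_crab_popularity_ds(datetime(2022, 1, 1), datetime(2022, 1, 8), verbose=True)
#   pdf.head()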
schema = t.StructType() \
        .add("time", t.StringType(), True) \
        .add("open", t.DoubleType(), True) \
        .add("close", t.DoubleType(), True) \
        .add("high", t.DoubleType(), True) \
        .add("low", t.DoubleType(), True) \
        .add("volume", t.DoubleType(), True) \
        .add("input_file", t.StringType(), True)
df = spark.read.format(file_type) \
    .options(header="true", inferSchema="true") \
    .schema(schema) \
    .load(file_location) \
    .withColumn("input_file", input_file_name())

# Get and split the file name to create a column with the corresponding coin pair for each row
split_col = split(df['input_file'], '/')
df = df.withColumn('coin_pair', split(split_col.getItem(3), r'\.').getItem(0))
df = df.drop("input_file")

# We have a timestamp and we want a date
df = df.withColumn('Date', from_unixtime((col('time')/1000)))

# Aggregate data to have a daily result, ready to insert into the database
df = df.groupBy("coin_pair",window("Date","1 day")) \
          .agg(_sum('volume'), last('close')-first('open'), _min('low'), _max('high') ) \
          .select(col("coin_pair"), \
                  to_date(col("window.start")).alias("date"), \
                  col("sum(volume)").alias("dailyVolume"), \
                  col("(last(close) - first(open))").alias("dailyResult"), \
                  col("min(low)").alias("dailyLower"), \
                  col("max(high)").alias("dailyHigher")) \
          .orderBy("coin_pair","date")

# Write data into the MongoDB database coins and daily_coins_data collection
df.write.format("com.mongodb.spark.sql.DefaultSource").option("uri","mongodb://XXXXXXX/coins.daily_coins_data").mode("append").save()
Example #7
# Define schema and retrieve data from the blob storage
schema = t.StructType() \
        .add("time", t.StringType(), True) \
        .add("open", t.DoubleType(), True) \
        .add("close", t.DoubleType(), True) \
        .add("high", t.DoubleType(), True) \
        .add("low", t.DoubleType(), True) \
        .add("volume", t.DoubleType(), True) \
        .add("input_file", t.StringType(), True)
df = spark.read.format(file_type).options(
    header="true",
    inferSchema="true").schema(schema).load(file_location).withColumn(
        "input_file", input_file_name())

# Get and split the file name to create a column with the corresponding coin pair for each row
split_col = split(df['input_file'], '/')
df = df.withColumn('coin_pair', split(split_col.getItem(3), r'\.').getItem(0))
df = df.drop("input_file")

# We have a timestamp and we want a date
df = df.withColumn('Date', from_unixtime((col('time') / 1000)))

# Aggregate data to have a monthly result, ready to insert into the database
df = df.withColumn("date",to_date(concat_ws("-",year('Date'),month('Date')))) \
          .groupBy("coin_pair","date") \
          .agg(_sum('volume').alias("monthlyVolume"), (last('close')-first('open')).alias("monthlyResult"), _min('low').alias("monthlyLower"), _max('high').alias("monthlyHigher")) \
          .orderBy("coin_pair","date")

# Write data into the MongoDB database coins and monthly_coins_data collection
df.write.format("com.mongodb.spark.sql.DefaultSource").option(
    "uri", "mongodb://XXXXXXX/coins.monthly_coins_data").mode("append").save()
Example #8
def useSpark(sourceFile: str, targetTsvFile: str) -> None:
    """[Process the input source files using Spark to transform to target data]

    Args:
        sourceFile (str): [Path to the location of the input data]
        targetTsvFile (str): [Path to the location of the target data]
    """

    # secrets for access to postgres database are held in .env file
    # this loads that into the application environment
    load_dotenv(verbose=True)

    spark = SparkSession.builder \
        .appName('Aquis2') \
        .master("local[2]") \
        .config(conf=getSparkConf(getJars())) \
        .getOrCreate()

    # clean data from source file
    cleanDf = spark.read.text(sourceFile) \
        .filter(col("value").contains("msgType_") & ~col("value").contains('msgType_":11')) \
        .withColumn("value", expr("substring(value,2)")) \
        .withColumn("value", regexp_replace("value", '\{\{', r'\{"header":\{')) \
        .withColumn("value", regexp_replace("value", 'SELL,', '"SELL",')) \
        .withColumn("value", regexp_replace("value", 'BUY,', '"BUY",')) \
        .withColumn("value", regexp_replace("value", '"flags_":"\{"', '"flags_":\{"'))

    # figure out schema on message 8, keep for re-use later as a technology demonstration
    msg8Schema = spark.read.json(
        cleanDf.filter(col("value").contains('"msgType_":8'))
        .select(col("value").cast("string"))
        .rdd.map(lambda r: r.value)
    )._jdf.schema().toDDL()
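    # The call above lets spark.read.json infer a schema from only the msgType_ 8 rows and
    # serializes it to a DDL string for re-use with from_json below. Going through the private
    # _jdf handle is a shortcut; from_json should also accept the public df.schema (a StructType)
    # directly, which would avoid the toDDL detour.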
    msg8Df = cleanDf.filter(col("value").contains('"msgType_":8')).withColumn("value", from_json("value", msg8Schema)) \
        .select("value.security_.securityId_", "value.security_.isin_", "value.security_.currency_") \
        .repartition(2, ["securityId_"])
    # msg8Df.printSchema()
    # root
    # | -- securityId_: long(nullable=true)
    # | -- isin_: string(nullable=true)
    # | -- currency_: string(nullable=true)

    # figure out schema on message 12, keep for re-use later as a technology demonstration
    msg12Schema = spark.read.json(
        cleanDf.filter(col("value").contains('"msgType_":12'))
        .select(col("value").cast("string"))
        .rdd.map(lambda r: r.value)
    )._jdf.schema().toDDL()
    msg12Df = cleanDf.filter(col("value").contains('"msgType_":12')) \
        .withColumn("value", from_json("value", msg12Schema)) \
        .repartition(2, ["value.bookEntry_.securityId_"])

    # msg12Df.printSchema()
    # msg12Df.select("value.bookEntry_.side_").show()
    # root
    # | -- value: struct(nullable=true)
    # | | -- bookEntry_: struct(nullable=true)
    # | | | -- orderId_: long(nullable=true)
    # | | | -- price_: long(nullable=true)
    # | | | -- quantity_: long(nullable=true)
    # | | | -- securityId_: long(nullable=true)
    # | | | -- side_: string(nullable=true)
    # | | -- header: struct(nullable=true)
    # | | | -- length_: long(nullable=true)
    # | | | -- msgType_: long(nullable=true)
    # | | | -- seqNo_: long(nullable=true)

    # now aggregate messageType12 by securityId_ and side_
    aggDfSells = msg12Df.filter("value.bookEntry_.side_ == 'SELL'") \
        .select("*", (col("value.bookEntry_.quantity_") * col("value.bookEntry_.price_")).alias("TotalSellAmount")) \
        .groupby("value.bookEntry_.securityId_") \
        .agg(count("value.bookEntry_.securityId_").alias("Total Sell Count"),
             _sum("value.bookEntry_.quantity_").alias("Total Sell Quantity"),
             _min("value.bookEntry_.price_").alias("Min Sell Price"),
             _sum("TotalSellAmount").alias("Weighted Average Sell Price")
             ) \
        .withColumn("Weighted Average Sell Price", col("Weighted Average Sell Price") / col("Total Sell Quantity"))

    # now aggregate messageType12 by securityId_ and side_
    aggDfBuys = msg12Df.filter("value.bookEntry_.side_ == 'BUY'") \
        .select("*", (col("value.bookEntry_.quantity_") * col("value.bookEntry_.price_")).alias("TotalBuyAmount")) \
        .groupby("value.bookEntry_.securityId_") \
        .agg(count("value.bookEntry_.securityId_").alias("Total Buy Count"),
             _sum("value.bookEntry_.quantity_").alias("Total Buy Quantity"),
             _max("value.bookEntry_.price_").alias("Max Buy Price"),
             _sum("TotalBuyAmount").alias("Weighted Average Buy Price")) \
        .withColumn("Weighted Average Buy Price", col("Weighted Average Buy Price") / col("Total Buy Quantity"))

    # bring it together with joins, use outer join with the security data due to missing ids
    # select columns in the following order..
    outputColList = [
        col("isin_").alias("ISIN"),
        col("currency_").alias("Currency"), "Total Buy Count",
        "Total Sell Count", "Total Buy Quantity", "Total Sell Quantity",
        "Weighted Average Buy Price", "Weighted Average Sell Price",
        "Max Buy Price", "Min Sell Price"
    ]

    outputDf = aggDfBuys.join(aggDfSells, ["securityId_"], "full_outer") \
        .join(msg8Df, ["securityId_"], "left_outer") \
        .na.fill(0, outputColList[2:]) \
        .na.fill("MISSING", ["isin_", "currency_"]) \
        .select(outputColList)

    # collect into a single file
    outputDf.coalesce(1).write.option("sep", "\t").csv(targetTsvFile,
                                                       header=True)

    # Demo writing to postgresql (msg8 dataframe)
    # will append records to table AcquisExample. Table will
    # be created on the fly if it does not exist.
    dburl = getDbConnectionUrl(db=os.getenv("POSTGRES_DB"),
                               user=os.getenv("POSTGRES_USER"),
                               secret=os.getenv("POSTGRES_SECRET"))
    msg8Df.write.format("jdbc") \
        .option("url", dburl) \
        .option("dbtable", "AcquisExample") \
        .option("driver", "org.postgresql.Driver") \
        .save(mode="append")

    spark.stop()
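
    # Hedged usage sketch (paths are illustrative assumptions):
    #   useSpark("data/aquis_messages.txt", "output/aquis_summary.tsv")
    # with POSTGRES_DB, POSTGRES_USER and POSTGRES_SECRET supplied via the .env file
    # loaded by load_dotenv() above.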
    # replace missing values with 0
    init_flat_data = init_flat_data.fillna(0)

    # cast columns
    init_flat_data = init_flat_data \
        .withColumn('order_date', to_date(unix_timestamp("order_date", "yyyy-MM-dd").cast('timestamp')) ) \
        .withColumn('NBI', col('NBI').cast('float') ) \
        .withColumn('order_id', col('order_id').cast('int')) \
        .withColumn('company_id', col('company_id').cast('int'))

    # print schema
    print(init_flat_data.printSchema(), '\n')

    # calculate min and max of order_date in order to calculate recency
    max_order_date, min_order_date = init_flat_data \
        .select( _max(col('order_date')), _min(col('order_date'))) \
        .take(1)[0]

    # calculate recency/frequency and monetary
    calculate_diff_day = udf(lambda x: (max_order_date - x).days,
                             IntegerType())
    rfm_table = init_flat_data \
        .withColumn('recency', calculate_diff_day('order_date')) \
        .groupby(['company_id', 'company_name', 'country']) \
        .agg(
            _mean(col('recency')).alias('recency'),
            _count(col('order_id')).alias('frequency'),
            _sum(col('NBI')).alias('monetary')
        )
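    # A non-UDF alternative for the recency column (assuming order_date is a DateType column):
    #   datediff(lit(max_order_date), col('order_date'))
    # computes the same day difference natively in Spark and avoids the Python UDF overhead.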

    # calculate quantiles for each variable
def main():
    """
    TODO: Create html page

    Access time filter logic:
        - If "last_access_ts" is less than 3 months ago, then set "months_old" as 3,
        - If "last_access_ts" is less than 6 monthsa ago, then set "months_old" as 6,
        - If "last_access_ts" is less than 12 months ago, then set "months_old" as 12

    The result includes only the datasets whose last access time are 12, 6 or 3 months ago.
    """
    spark = get_spark_session()
    (df_contents_f_to_b, df_contents_b_to_d, df_replicas, df_dids_files,
     df_replicas_j_dids, df_files_complete) = prepare_spark_dataframes(spark)

    # ===============================================================================
    # Continue with joins
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    # -------------------------------------------------------------------------------

    # --- STEP-10 / Tests to check dataframes are okay ---:
    #         df_block_file_rse.select("file").distinct().count() =  is 29921156
    #         df_block_file_rse.filter(col("file").isNull()).count() = 0
    #         df_block_file_rse.filter(col("block").isNull()).count() = 57892
    #         Above line means we cannot extract the block names of 57892 files from the CONTENTS table ..
    #         .. which provides F:D and D:C mapping (file, dataset, container in Rucio terms)
    #         df_block_file_rse.filter(col("rse_id").isNull()).count() = 0
    #         df_block_file_rse.filter(col("fsize").isNull()).count() = 0
    #         We are all good, just drop null block names.

    # STEP-10: Left join df_files_complete and df_contents_f_to_b to get block names of files.
    #   - There are some files that we cannot extract their block names from CONTENTS table
    #   - So filter out them.
    df_block_file_rse = df_files_complete \
        .join(df_contents_f_to_b, ["file"], how="left") \
        .select(['block', 'file', 'rse_id', 'accessed_at', 'fsize', ]) \
        .filter(col("block").isNotNull()) \
        .cache()

    # --- STEP-11 / Tests to check dataframes are okay ---:
    #         df_all.filter(col("dataset").isNull()).count() = 280821

    # STEP-11: Left join df_block_file_rse and df_contents_b_to_d to get dataset names of blocks&files.
    #   - There are some blocks that we cannot extract their dataset names from CONTENTS table.
    #   - So filter out them.
    df_all = df_block_file_rse \
        .join(df_contents_b_to_d, ["block"], how="left") \
        .select(['dataset', 'block', 'file', 'rse_id', 'accessed_at', 'fsize']) \
        .filter(col("dataset").isNotNull()) \
        .cache()

    # STEP-12: Group by "dataset" and "rses" to calculate:
    #       - `dataset_size_in_rse`: total size of the dataset in a RSE, by summing up the sizes of all of the dataset's files in that RSE.
    #       - `last_access_time_of_dataset_per_rse`: last access time of the dataset in a RSE ...
    #           ... by taking the max of the `accessed_at` field over all of the dataset's files in that RSE.
    #       - `#files_null_access_time_per_rse`: number of files which have a NULL `accessed_at` field ...
    #           ... in each dataset in a RSE. ...
    #           ... This is important to know, so any file with a NULL accessed_at can be filtered out of the calculation.
    #       - `#files_per_rse`: number of files of the dataset in that RSE
    #       - `#files_unique_per_rse`: unique count of dataset files in that RSE
    #       Final result will be like: one dataset can be in multiple RSEs and presumably ...
    #           ... it may have different sizes, since a dataset may have lost one of its blocks or files in a RSE.
    df_final_dataset_rse = df_all \
        .groupby(["dataset", "rse_id"]) \
        .agg(_sum(col("fsize")).alias("dataset_size_in_rse"),
             _max(col("accessed_at")).alias("last_access_time_of_dataset_per_rse"),
             _sum(when(col("accessed_at").isNull(), 1).otherwise(0)).alias("#files_null_access_time_per_rse"),
             _count(lit(1)).alias("#files_per_rse"),
             countDistinct(col("file")).alias("#files_unique_per_rse"),
             ) \
        .cache()

    # STEP-13: Get thresholds. They are unix timestamps which are 3, 6 and 12 months ago from today.
    ts_thresholds = get_ts_thresholds()
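    # A minimal sketch of what get_ts_thresholds might return (an assumption, not the actual
    # implementation): unix timestamps for 3, 6 and 12 months ago, keyed by number of months
    # and expressed in the same unit as the accessed_at field, e.g.
    #   from datetime import datetime
    #   from dateutil.relativedelta import relativedelta
    #   {m: int((datetime.utcnow() - relativedelta(months=m)).timestamp()) for m in (3, 6, 12)}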

    # STEP-14:
    #   Filter for calculating last_accessed_at_least_{12|6|3}_months_ago columns.
    #       - To produce correct results, the "last_access_time_of_dataset_per_rse" field should not be null,
    #           which means that all of the dataset's files have their accessed_at fields filled.
    #       - And "#files_null_access_time_per_rse" == 0 means that there should not be ...
    #           any file with a NULL "accessed_at" field.
    # Group by dataset to get final result from all RSEs' datasets.
    #   - max_dataset_size(TB): max size of dataset in all RSEs that contain this dataset
    #   - min_dataset_size(TB): min size of dataset in all RSEs that contain this dataset
    #   - avg_dataset_size(TB): avg size of dataset in all RSEs that contain this dataset
    #   - last_access_time_of_dataset: last access time of dataset in all RSEs
    df = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNotNull() &
                (col("#files_null_access_time_per_rse") == 0)
                ) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .withColumn('last_access_more_than_12_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[12], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_6_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[6], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_3_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[3], 1).otherwise(0)
                    ) \
        .filter((col('last_access_more_than_12_months_ago') == 1) |
                (col('last_access_more_than_6_months_ago') == 1) |
                (col('last_access_more_than_3_months_ago') == 1)
                ) \
        .cache()

    # STEP-15: Find datasets which have only null accessed_at fields in its files
    df_all_null_accessed_at = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNull()) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .cache()

    # Totals for non-null data: not read for more than 3, 6 or 12 months, which is equal to the "more than 3 months" values.
    df.select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()

    # For 12 months
    df.filter(col("last_access_more_than_12_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_12_months_ago") == 1).count())

    # For 6 months
    df.filter(col("last_access_more_than_6_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_6_months_ago") == 1).count())

    # For 3 months
    df.filter(col("last_access_more_than_3_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_3_months_ago") == 1).count())

    # For all null accessed_at(all files) datasets
    df_all_null_accessed_at.select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df_all_null_accessed_at.count())

    return df, df_all_null_accessed_at