def get_df_main_datasets_never_read(df_dataset_file_rse_ts_size,
                                    filtered_rses_id_name_map, min_tb_limit):
    """Get never accessed datasets' dataframes"""
    reverted_filtered_rses_id_name_map = get_reverted_rses_id_name_map(
        filtered_rses_id_name_map)
    df_sub_datasets_never_read = df_dataset_file_rse_ts_size \
        .groupby(['rse_id', 'dataset']) \
        .agg(_round(_sum(col('f_size')) / TB_DENOMINATOR, 5).alias('dataset_size_in_rse_tb'),
             _max(col('accessed_at')).alias('last_access_time_of_dataset_in_rse'),
             _max(col('created_at')).alias('last_create_time_of_dataset_in_rse'),
             _count(lit(1)).alias('#_files_of_dataset_in_rse'),
             ) \
        .filter(col('last_access_time_of_dataset_in_rse').isNull()) \
        .filter(col('dataset_size_in_rse_tb') > min_tb_limit) \
        .replace(reverted_filtered_rses_id_name_map, subset=['rse_id']) \
        .withColumnRenamed('rse_id', 'RSE name') \
        .select(['RSE name',
                 'dataset',
                 'dataset_size_in_rse_tb',
                 'last_create_time_of_dataset_in_rse',
                 '#_files_of_dataset_in_rse'
                 ]) \
        .cache()
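    # Note: `TB_DENOMINATOR` is a module-level constant assumed to convert bytes to
    # terabytes, i.e. something like TB_DENOMINATOR = 10 ** 12 (consistent with the
    # explicit 10 ** 12 divisions used in the main() function further below).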

    df_main_datasets_never_read = df_sub_datasets_never_read \
        .groupby(['dataset']) \
        .agg(_max(col('dataset_size_in_rse_tb')).alias('max_dataset_size_in_rses(TB)'),
             _min(col('dataset_size_in_rse_tb')).alias('min_dataset_size_in_rses(TB)'),
             _avg(col('dataset_size_in_rse_tb')).alias('avg_dataset_size_in_rses(TB)'),
             _sum(col('dataset_size_in_rse_tb')).alias('sum_dataset_size_in_rses(TB)'),
             _max(col('last_create_time_of_dataset_in_rse')).alias('last_create_time_of_dataset_in_all_rses'),
             concat_ws(', ', collect_list('RSE name')).alias('RSE(s)'),
             ) \
        .cache()
    return df_main_datasets_never_read, df_sub_datasets_never_read
    init_flat_data.printSchema()

    # calculate min and max of order date in order to calculate recency
    max_order_date, min_order_date = init_flat_data \
        .select( _max(col('order_date')), _min(col('order_date'))) \
        .take(1)[0]

    # calculate recency/frequency and monetary
    calculate_diff_day = udf(lambda x: (max_order_date - x).days,
                             IntegerType())
    rfm_table = init_flat_data \
        .withColumn('recency', calculate_diff_day('order_date')) \
        .groupby(['company_id', 'company_name', 'country']) \
        .agg(
            _mean(col('recency')).alias('recency'),
            _count(col('order_id')).alias('frequency'),
            _sum(col('NBI')).alias('monetary')
        )

    # calculate quantiles for each variable
    quantiles = rfm_table.approxQuantile(['recency', 'frequency', 'monetary'],
                                         [0.20, 0.4, 0.6, 0.8], 0)
    r_quantile = quantiles[0]
    f_quantile = quantiles[1]
    m_quantile = quantiles[2]

    # calculate score of each variable
    def_r_score = udf(
        lambda x: 5 if x < r_quantile[0] else 4 if x < r_quantile[1] else 3
        if x < r_quantile[2] else 2 if x < r_quantile[3] else 1, IntegerType())
    def_f_score = udf(
        lambda x: 1 if x < f_quantile[0] else 2 if x < f_quantile[1] else 3
        if x < f_quantile[2] else 4 if x < f_quantile[3] else 5, IntegerType())
def get_df_sub_not_read_since(df_dataset_file_rse_ts_size,
                              filtered_rses_id_name_map, min_tb_limit,
                              n_months_filter):
    """Get dataframe of datasets that are not read since N months for sub details htmls

    Group by 'dataset' and 'rse_id' of get_df_dataset_file_rse_ts_size

    Filters:
        - Datasets whose 'last_access_time_of_dataset_in_rse' is NULL (no file of the dataset was ever read) are filtered out
        - Datasets whose size in the RSE is not greater than 'min_tb_limit' are filtered out

    Access time filter logic (a minimal sketch of the assumed epoch helper follows this function):
        - If 'last_access_time_of_dataset_in_rse' is older than 'n_months_filter' months, ...
          ... set the 'is_not_read_since_{n_months_filter}_months' column to True

    Columns:
        - 'dataset_size_in_rse_tb'
                Total size of a dataset in an RSE.
                Produced by summing up the sizes of all of the dataset's files in that RSE.
        - 'last_access_time_of_dataset_in_rse'
                Last access time of a dataset in an RSE.
                Produced by taking the max `accessed_at` (a single file's access time) of the dataset's files in that RSE.
        - '#_accessed_files_of_dataset_in_rse'
                Number of files of the dataset in that RSE that have a non-NULL `accessed_at` value.
        - '#_files_of_dataset_in_rse'
                Number of files of the dataset in that RSE

    Returns a dataframe with RSE name, dataset, and their size and access time calculations.
    """
    # Boolean column name flagging whether a dataset-rse_id pair has not been read for at least n_months_filter months
    bool_column_is_not_read_since_n_months = 'is_not_read_since_{}_months'.format(
        str(n_months_filter))

    # Get reverted dict to get RSE names from id
    reverted_filtered_rses_id_name_map = get_reverted_rses_id_name_map(
        filtered_rses_id_name_map)
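    # `get_reverted_rses_id_name_map` is defined elsewhere; it is assumed to simply swap
    # keys and values of the given mapping so that the .replace(...) call below can
    # translate the 'rse_id' column values into RSE names. A minimal sketch:
    #
    #   def get_reverted_rses_id_name_map(rses_id_name_map):
    #       return {v: k for k, v in rses_id_name_map.items()}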

    return df_dataset_file_rse_ts_size \
        .groupby(['rse_id', 'dataset']) \
        .agg(_round(_sum(col('f_size')) / TB_DENOMINATOR, 5).alias('dataset_size_in_rse_tb'),
             _max(col('accessed_at')).alias('last_access_time_of_dataset_in_rse'),
             _sum(
                 when(col('accessed_at').isNull(), 0).otherwise(1)
             ).alias('#_accessed_files_of_dataset_in_rse'),
             _count(lit(1)).alias('#_files_of_dataset_in_rse'),
             ) \
        .withColumn(bool_column_is_not_read_since_n_months,
                    when(
                        col('last_access_time_of_dataset_in_rse') < get_n_months_ago_epoch_msec(n_months_filter),
                        True).otherwise(False)
                    ) \
        .filter(col('last_access_time_of_dataset_in_rse').isNotNull()) \
        .filter(col(bool_column_is_not_read_since_n_months)) \
        .filter(col('dataset_size_in_rse_tb') > min_tb_limit) \
        .replace(reverted_filtered_rses_id_name_map, subset=['rse_id']) \
        .withColumnRenamed('rse_id', 'RSE name') \
        .select(['RSE name',
                 'dataset',
                 'dataset_size_in_rse_tb',
                 'last_access_time_of_dataset_in_rse',
                 '#_files_of_dataset_in_rse',
                 '#_accessed_files_of_dataset_in_rse',
                 ]) \
        .cache()
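

# `get_n_months_ago_epoch_msec`, used in the access time filter above, is not shown here.
# A minimal sketch of what it is assumed to return: the epoch timestamp, in milliseconds,
# of the moment exactly N months ago, comparable with the millisecond-precision
# `accessed_at` values (the unit is an assumption based on the helper's name).
def get_n_months_ago_epoch_msec_sketch(n_months_filter):
    """Illustrative sketch only: epoch milliseconds of `n_months_filter` months ago."""
    from datetime import datetime, timezone
    from dateutil.relativedelta import relativedelta
    n_months_ago = datetime.now(timezone.utc) - relativedelta(months=n_months_filter)
    return int(n_months_ago.timestamp() * 1000)

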
def main():
    """
    TODO: Create html page

    Access time filter logic:
        - If "last_access_ts" is less than 3 months ago, then set "months_old" as 3,
        - If "last_access_ts" is less than 6 monthsa ago, then set "months_old" as 6,
        - If "last_access_ts" is less than 12 months ago, then set "months_old" as 12

    The result includes only the datasets whose last access time are 12, 6 or 3 months ago.
    """
    spark = get_spark_session()
    (df_contents_f_to_b, df_contents_b_to_d, df_replicas, df_dids_files,
     df_replicas_j_dids, df_files_complete) = prepare_spark_dataframes(spark)

    # ===============================================================================
    # Continue with joins
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    # -------------------------------------------------------------------------------

    # --- STEP-10 / Tests to check dataframes are okay ---:
    #         df_block_file_rse.select("file").distinct().count() =  is 29921156
    #         df_block_file_rse.filter(col("file").isNull()).count() = 0
    #         df_block_file_rse.filter(col("block").isNull()).count() = 57892
    #         The line above means that we cannot extract the block names of 57892 files from the CONTENTS table ..
    #         .. which provides the F:D and D:C mappings (file, dataset, container in Rucio terms)
    #         df_block_file_rse.filter(col("rse_id").isNull()).count() = 0
    #         df_block_file_rse.filter(col("fsize").isNull()).count() = 0
    #         We are all good, just drop null block names.

    # STEP-10: Left join df_files_complete and df_contents_f_to_b to get block names of files.
    #   - There are some files whose block names cannot be extracted from the CONTENTS table,
    #   - so filter them out.
    df_block_file_rse = df_files_complete \
        .join(df_contents_f_to_b, ["file"], how="left") \
        .select(['block', 'file', 'rse_id', 'accessed_at', 'fsize', ]) \
        .filter(col("block").isNotNull()) \
        .cache()

    # --- STEP-11 / Tests to check dataframes are okay ---:
    #         df_all.filter(col("dataset").isNull()).count() = 280821

    # STEP-11: Left join df_block_file_rse and df_contents_b_to_d to get dataset names of blocks&files.
    #   - There are some blocks whose dataset names cannot be extracted from the CONTENTS table,
    #   - so filter them out.
    df_all = df_block_file_rse \
        .join(df_contents_b_to_d, ["block"], how="left") \
        .select(['dataset', 'block', 'file', 'rse_id', 'accessed_at', 'fsize']) \
        .filter(col("dataset").isNotNull()) \
        .cache()

    # STEP-12: Group by "dataset" and "rse_id" to calculate:
    #       - `dataset_size_in_rse`: total size of the dataset in an RSE, by summing up the dataset's files in that RSE.
    #       - `last_access_time_of_dataset_per_rse`: last access time of the dataset in an RSE ...
    #           ... by taking the max of the file `accessed_at` field over the dataset's files in that RSE.
    #       - `#files_null_access_time_per_rse`: number of files which have a NULL `accessed_at` field ...
    #           ... in each dataset in an RSE. ...
    #           ... This is important to know in order to filter out datasets that have any file with a NULL accessed_at.
    #       - `#files_per_rse`: number of files of the dataset in that RSE
    #       - `#files_unique_per_rse`: unique count of the dataset's files in that RSE
    #       In the final result, one dataset can be in multiple RSEs and may have different sizes in them, ...
    #           ... since a dataset may have lost one of its blocks or files in an RSE.
    df_final_dataset_rse = df_all \
        .groupby(["dataset", "rse_id"]) \
        .agg(_sum(col("fsize")).alias("dataset_size_in_rse"),
             _max(col("accessed_at")).alias("last_access_time_of_dataset_per_rse"),
             _sum(when(col("accessed_at").isNull(), 1).otherwise(0)).alias("#files_null_access_time_per_rse"),
             _count(lit(1)).alias("#files_per_rse"),
             countDistinct(col("file")).alias("#files_unique_per_rse"),
             ) \
        .cache()

    # STEP-13: Get thresholds. They are unix timestamps which are 3, 6 and 12 months ago from today.
    ts_thresholds = get_ts_thresholds()
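    # `get_ts_thresholds` is defined elsewhere; it is assumed to return a dict keyed by
    # the number of months, e.g. {3: ts_3_months_ago, 6: ts_6_months_ago, 12: ts_12_months_ago},
    # with timestamps in the same unit as the `accessed_at` field. A rough sketch,
    # approximating a month as 30 days:
    #
    #   from datetime import datetime, timedelta
    #   def get_ts_thresholds():
    #       now = datetime.utcnow()
    #       return {m: int((now - timedelta(days=30 * m)).timestamp()) for m in (3, 6, 12)}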

    # STEP-14:
    #   Filter for calculating last_accessed_at_least_{12|6|3}_months_ago columns.
    #       - To produce correct results, the "last_access_time_of_dataset_per_rse" field should not be null, ...
    #           ... i.e. at least one file of the dataset has a filled accessed_at field.
    #       - And "#files_null_access_time_per_rse" == 0 means that there is no ...
    #           file with a NULL "accessed_at" field at all.
    # Group by dataset to get final result from all RSEs' datasets.
    #   - max_dataset_size(TB): max size of dataset in all RSEs that contain this dataset
    #   - min_dataset_size(TB): min size of dataset in all RSEs that contain this dataset
    #   - avg_dataset_size(TB): avg size of dataset in all RSEs that contain this dataset
    #   - last_access_time_of_dataset: last access time of dataset in all RSEs
    df = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNotNull() &
                (col("#files_null_access_time_per_rse") == 0)
                ) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .withColumn('last_access_more_than_12_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[12], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_6_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[6], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_3_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[3], 1).otherwise(0)
                    ) \
        .filter((col('last_access_more_than_12_months_ago') == 1) |
                (col('last_access_more_than_6_months_ago') == 1) |
                (col('last_access_more_than_3_months_ago') == 1)
                ) \
        .cache()

    # STEP-15: Find datasets which have only null accessed_at fields in its files
    df_all_null_accessed_at = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNull()) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .cache()

    # Totals for the not-null data: datasets not read for more than 3, 6 or 12 months, which equals the 'more than 3 months' values.
    df.select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()

    # For 12 months
    df.filter(col("last_access_more_than_12_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_12_months_ago") == 1).count())

    # For 6 months
    df.filter(col("last_access_more_than_6_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_6_months_ago") == 1).count())

    # For 3 months
    df.filter(col("last_access_more_than_3_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_3_months_ago") == 1).count())

    # For all null accessed_at(all files) datasets
    df_all_null_accessed_at.select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df_all_null_accessed_at.count())

    return df, df_all_null_accessed_at
Example No. 5
    schema = StructType() \
        .add("sentence", StringType()) \
        .add("candidate", StringType()) 
     

    df2 = df1\
        .select(from_json(col("value"), schema).alias("transaction_detail"), "topic", "partition", "timestamp")

    df2.createOrReplaceTempView("main_table")

    df3 = spark.sql("select transaction_detail.*, topic, partition, timestamp from main_table ")

    df3 = df3.repartitionByRange(2, col("candidate"))

    print("-------- print the length of each partition  -------------------")
    df3_partitioning_details = df3.withColumn("partition_id", spark_partition_id()).groupBy("partition_id").agg(_count("candidate"))
    

    print_partitioning_details = df3_partitioning_details \
        .writeStream \
        .trigger(processingTime='15 seconds') \
        .outputMode("update") \
        .option("truncate", "false") \
        .format("console") \
        .start()

    df4 = df3     
    print("Printing Schema of df4: ")
    df4.printSchema()

    def cleanTxt(text):
Example No. 6
def create_main_df(spark, hdfs_paths, base_eos_dir):
    # UTC timestamp of start hour of spark job
    ts_current_hour = int(datetime.utcnow().replace(
        minute=0, second=0, microsecond=0, tzinfo=timezone.utc).timestamp() *
                          1000)
    # -----------------------------------------------------------------------------------------------------------------
    #                -- ==================  Prepare main Spark dataframes  ===========================

    # Get RSES id, name, type, tier, country, kind from RSES table dump
    df_rses = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['RSES']) \
        .filter(col('DELETED_AT').isNull()) \
        .withColumn('replica_rse_id', lower(_hex(col('ID')))) \
        .withColumnRenamed('RSE', 'rse') \
        .withColumnRenamed('RSE_TYPE', 'rse_type') \
        .withColumn('rse_tier', _split(col('rse'), '_').getItem(0)) \
        .withColumn('rse_country', _split(col('rse'), '_').getItem(1)) \
        .withColumn('rse_kind',
                    when(col("rse").endswith('Temp'), 'temp')
                    .when(col("rse").endswith('Test'), 'test')
                    .otherwise('prod')
                    ) \
        .select(['replica_rse_id', 'rse', 'rse_type', 'rse_tier', 'rse_country', 'rse_kind'])
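
    # CMS RSE names follow a '<tier>_<country>_<site>[_<suffix>]' convention (e.g. 'T2_CH_CERN'),
    # which is why the tier and country above are taken from the first two '_'-separated tokens
    # and the 'Temp'/'Test' suffixes are used to classify temporary and test RSEs.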

    # Rucio Dataset(D) refers to dbs block, so we used DBS terminology from the beginning
    df_contents_f_to_b = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['CONTENTS']) \
        .filter(col("SCOPE") == "cms") \
        .filter(col("DID_TYPE") == "D") \
        .filter(col("CHILD_TYPE") == "F") \
        .withColumnRenamed("NAME", "block") \
        .withColumnRenamed("CHILD_NAME", "file") \
        .select(["block", "file"])

    # Rucio Dataset(D) refers to dbs block; Rucio Container(C) refers to dbs dataset.
    # We used DBS terminology from the beginning
    df_contents_b_to_d = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['CONTENTS']) \
        .filter(col("SCOPE") == "cms") \
        .filter(col("DID_TYPE") == "C") \
        .filter(col("CHILD_TYPE") == "D") \
        .withColumnRenamed("NAME", "dataset") \
        .withColumnRenamed("CHILD_NAME", "block") \
        .select(["dataset", "block"])

    # Get file to dataset map
    df_contents_ds_files = df_contents_f_to_b.join(df_contents_b_to_d, ["block"], how="left") \
        .filter(col('file').isNotNull()) \
        .filter(col('dataset').isNotNull()) \
        .withColumnRenamed('dataset', 'contents_dataset') \
        .withColumn('is_d_name_from_rucio', lit(BOOL_STR[True])) \
        .select(["contents_dataset", "file", "is_d_name_from_rucio"])

    dbs_files = spark.read.format('avro').load(hdfs_paths['FILES']) \
        .withColumnRenamed('LOGICAL_FILE_NAME', 'file') \
        .withColumnRenamed('DATASET_ID', 'dbs_file_ds_id') \
        .withColumnRenamed('FILE_SIZE', 'dbs_file_size') \
        .select(['file', 'dbs_file_ds_id', 'dbs_file_size'])

    dbs_datasets = spark.read.format('avro').load(hdfs_paths['DATASETS'])

    df_dbs_ds_files = dbs_files.join(dbs_datasets.select(['DATASET_ID', 'DATASET']),
                                     dbs_files.dbs_file_ds_id == dbs_datasets.DATASET_ID, how='left') \
        .filter(col('file').isNotNull()) \
        .filter(col('DATASET').isNotNull()) \
        .withColumnRenamed('dbs_file_ds_id', 'dbs_dataset_id') \
        .withColumnRenamed('DATASET', 'dbs_dataset') \
        .withColumn('is_d_name_from_dbs', lit(BOOL_STR[True])) \
        .select(['file', 'dbs_dataset', 'is_d_name_from_dbs'])

    # Prepare replicas
    df_replicas = spark.read.format('avro').load(hdfs_paths['REPLICAS']) \
        .filter(col("SCOPE") == "cms") \
        .withColumn('replica_rse_id', lower(_hex(col('RSE_ID')))) \
        .withColumn('replica_file_size', col('BYTES').cast(LongType())) \
        .withColumnRenamed('NAME', 'file') \
        .withColumnRenamed('ACCESSED_AT', 'replica_accessed_at') \
        .withColumnRenamed('CREATED_AT', 'replica_created_at') \
        .withColumnRenamed('LOCK_CNT', 'lock_cnt') \
        .withColumnRenamed('STATE', 'state') \
        .select(['file', 'replica_rse_id', 'replica_file_size',
                 'replica_accessed_at', 'replica_created_at', 'lock_cnt'])

    # Create enriched file df which adds the DBS file size to replicas files. The left join keeps only files present in replicas.
    df_files_enriched_with_dbs = df_replicas \
        .join(dbs_files.select(['file', 'dbs_file_size']), ['file'], how='left') \
        .withColumn('joint_file_size',
                    when(col('replica_file_size').isNotNull(), col('replica_file_size'))
                    .when(col('dbs_file_size').isNotNull(), col('dbs_file_size'))
                    ) \
        .select(['file', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'lock_cnt',
                 'replica_file_size', 'dbs_file_size', 'joint_file_size'])
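
    # Note: the two chained when() calls above act as a null-coalescing pick; with
    # `from pyspark.sql.functions import coalesce` the same column could also be written as:
    #   .withColumn('joint_file_size', coalesce(col('replica_file_size'), col('dbs_file_size')))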

    # -----------------------------------------------------------------------------------------------------------------
    #            -- ==================  only Rucio: Replicas and Contents  ======================= --

    df_only_from_rucio = df_replicas \
        .join(df_contents_ds_files, ['file'], how='left') \
        .select(['contents_dataset', 'file', 'replica_rse_id', 'replica_file_size',
                 'replica_accessed_at', 'replica_created_at', 'is_d_name_from_rucio', 'lock_cnt'])

    # Use them in outer join
    # _max(col('replica_accessed_at')).alias('rucio_last_accessed_at'),
    # _max(col('replica_created_at')).alias('rucio_last_created_at'),

    df_only_from_rucio = df_only_from_rucio \
        .groupby(['replica_rse_id', 'contents_dataset']) \
        .agg(_sum(col('replica_file_size')).alias('rucio_size'),
             _count(lit(1)).alias('rucio_n_files'),
             _sum(
                 when(col('replica_accessed_at').isNull(), 0)
                     .otherwise(1)
             ).alias('rucio_n_accessed_files'),
             _first(col("is_d_name_from_rucio")).alias("is_d_name_from_rucio"),
             _sum(col('lock_cnt')).alias('rucio_locked_files')
             ) \
        .withColumn('rucio_is_d_locked',
                    when(col('rucio_locked_files') > 0, IS_DATASET_LOCKED[True])
                    .otherwise(IS_DATASET_LOCKED[False])
                    ) \
        .select(['contents_dataset', 'replica_rse_id', 'rucio_size', 'rucio_n_files', 'rucio_n_accessed_files',
                 'is_d_name_from_rucio', 'rucio_locked_files', 'rucio_is_d_locked', ])

    # -----------------------------------------------------------------------------------------------------------------
    #             -- =================  only DBS: Replicas, Files, Datasets  ====================== --

    # Only files present in Replicas are processed; select only the DBS-related fields
    df_only_from_dbs = df_files_enriched_with_dbs \
        .select(['file', 'replica_rse_id', 'dbs_file_size', 'replica_accessed_at', 'lock_cnt']) \
        .join(df_dbs_ds_files, ['file'], how='left') \
        .filter(col('dbs_dataset').isNotNull()) \
        .select(['file', 'dbs_dataset', 'replica_rse_id', 'dbs_file_size', 'replica_accessed_at',
                 'is_d_name_from_dbs', 'lock_cnt'])

    df_only_from_dbs = df_only_from_dbs \
        .groupby(['replica_rse_id', 'dbs_dataset']) \
        .agg(_sum(col('dbs_file_size')).alias('dbs_size'),
             _count(lit(1)).alias('dbs_n_files'),
             _sum(
                 when(col('replica_accessed_at').isNull(), 0)
                     .otherwise(1)
             ).alias('dbs_n_accessed_files'),
             _first(col("is_d_name_from_dbs")).alias("is_d_name_from_dbs"),
             _sum(col('lock_cnt')).alias('dbs_locked_files')
             ) \
        .withColumn('dbs_is_d_locked',
                    when(col('dbs_locked_files') > 0, IS_DATASET_LOCKED[True])
                    .otherwise(IS_DATASET_LOCKED[False])
                    ) \
        .select(['dbs_dataset', 'replica_rse_id', 'dbs_size', 'dbs_n_files', 'dbs_n_accessed_files',
                 'is_d_name_from_dbs', 'dbs_locked_files', 'dbs_is_d_locked'])

    # Full outer join of Rucio and DBS to get all dataset-file maps
    df_dataset_file_map_enr = df_contents_ds_files.join(df_dbs_ds_files,
                                                        ['file'],
                                                        how='full')

    # -----------------------------------------------------------------------------------------------------------------
    #               -- ======  check files do not have dataset name  ============ --

    # Check for Replicas files that do not have a dataset name in Contents, DBS, or both
    x = df_replicas.join(df_dataset_file_map_enr, ['file'], how='left') \
        .select(['contents_dataset', 'dbs_dataset', 'file'])

    y_contents = x.filter(col('contents_dataset').isNull())
    z_dbs = x.filter(col('dbs_dataset').isNull())
    t_both = x.filter(
        col('contents_dataset').isNull() & col('dbs_dataset').isNull())
    stats_dict = {
        "Replicas files do not have dataset name in Contents":
        y_contents.select('file').distinct().count(),
        "Replicas files do not have dataset name in DBS":
        z_dbs.select('file').distinct().count(),
        "Replicas files do not have dataset name neither in Contents nor DBS":
        t_both.select('file').distinct().count()
    }
    write_stats_to_eos(base_eos_dir, stats_dict)
    del x, y_contents, z_dbs, t_both
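
    # `write_stats_to_eos` is defined elsewhere; it presumably persists stats_dict
    # (e.g. as JSON) under base_eos_dir so the counts can be inspected later. A purely
    # illustrative sketch (file name and format are assumptions):
    #
    #   import json, os
    #   def write_stats_to_eos(base_eos_dir, stats_dict):
    #       os.makedirs(base_eos_dir, exist_ok=True)
    #       with open(os.path.join(base_eos_dir, 'stats.json'), 'w') as f:
    #           json.dump(stats_dict, f, indent=2)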

    # -----------------------------------------------------------------------------------------------------------------
    #              -- ======  joint Rucio and DBS: Replicas, Contents, Files, Datasets  ============ --

    # Main aim is to get all datasets of files
    df_dataset_file_map_enr = df_dataset_file_map_enr \
        .withColumn("dataset",
                    when(col("contents_dataset").isNotNull(), col("contents_dataset"))
                    .when(col("dbs_dataset").isNotNull(), col("dbs_dataset"))
                    ) \
        .withColumn("is_ds_from_rucio", when(col("is_d_name_from_rucio").isNotNull(), 1).otherwise(0)) \
        .withColumn("is_ds_from_dbs", when(col("is_d_name_from_dbs").isNotNull(), 1).otherwise(0)) \
        .select(['dataset', 'file', 'is_ds_from_dbs', 'is_ds_from_rucio'])

    df_joint_ds_files = df_files_enriched_with_dbs \
        .select(['file', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at',
                 'joint_file_size', 'lock_cnt']) \
        .join(df_dataset_file_map_enr, ['file'], how='left') \
        .filter(col('dataset').isNotNull()) \
        .select(['dataset', 'file', 'is_ds_from_dbs', 'is_ds_from_rucio',
                 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'joint_file_size', 'lock_cnt'])

    df_joint_main = df_joint_ds_files \
        .groupby(['replica_rse_id', 'dataset']) \
        .agg(_sum(col('joint_file_size')).alias('joint_size'),
             _max(col('replica_accessed_at')).alias('joint_last_accessed_at'),
             _max(col('replica_created_at')).alias('joint_last_created_at'),
             _sum(col('is_ds_from_dbs')).alias('joint_dbs_n_files'),
             _sum(col('is_ds_from_rucio')).alias('joint_rucio_n_files'),
             _count(lit(1)).alias('joint_n_files'),
             _sum(
                 when(col('replica_accessed_at').isNull(), 0).otherwise(1)
             ).alias('joint_n_accessed_files'),
             _sum(col('lock_cnt')).alias('joint_locked_files')
             ) \
        .withColumn('all_f_in_dbs',
                    when((col('joint_dbs_n_files') == 0) | (col('joint_dbs_n_files').isNull()),
                         IS_ALL_DATASET_FILES_EXISTS['n'])
                    .when(col('joint_dbs_n_files') == col('joint_n_files'), IS_ALL_DATASET_FILES_EXISTS['a'])
                    .when(col('joint_dbs_n_files') > 0, IS_ALL_DATASET_FILES_EXISTS['p'])
                    ) \
        .withColumn('all_f_in_rucio',
                    when((col('joint_rucio_n_files') == 0) | (col('joint_rucio_n_files').isNull()),
                         IS_ALL_DATASET_FILES_EXISTS['n'])
                    .when(col('joint_rucio_n_files') == col('joint_n_files'), IS_ALL_DATASET_FILES_EXISTS['a'])
                    .when(col('joint_rucio_n_files') > 0, IS_ALL_DATASET_FILES_EXISTS['p'])
                    ) \
        .withColumn('joint_is_d_locked',
                    when(col('joint_locked_files') > 0, IS_DATASET_LOCKED[True])
                    .otherwise(IS_DATASET_LOCKED[False])
                    ) \
        .withColumnRenamed("replica_rse_id", "rse_id") \
        .select(['dataset',
                 'rse_id',
                 'joint_size',
                 'joint_last_accessed_at',
                 'joint_last_created_at',
                 'joint_dbs_n_files',
                 'joint_rucio_n_files',
                 'joint_n_files',
                 'joint_n_accessed_files',
                 'all_f_in_dbs',
                 'all_f_in_rucio',
                 'joint_locked_files',
                 'joint_is_d_locked'
                 ])
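
    # BOOL_STR, IS_DATASET_LOCKED and IS_ALL_DATASET_FILES_EXISTS used above are assumed
    # module-level lookup dicts; only their keys are implied by this code, the values below
    # are illustrative:
    #   BOOL_STR = {True: 'True', False: 'False'}
    #   IS_DATASET_LOCKED = {True: 'LOCKED', False: 'DYNAMIC'}
    #   IS_ALL_DATASET_FILES_EXISTS = {'a': 'ALL', 'p': 'PARTIAL', 'n': 'NONE'}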
    # -----------------------------------------------------------------------------------------------------------------
    #          -- ============  Dataset enrichment with Dataset tags  ============ --

    # Enrich dbs datasets with the names corresponding to the id fields of other tables
    dbs_data_tiers = spark.read.format('avro').load(hdfs_paths['DATA_TIERS'])
    dbs_physics_group = spark.read.format('avro').load(
        hdfs_paths['PHYSICS_GROUPS'])
    dbs_acquisition_era = spark.read.format('avro').load(
        hdfs_paths['ACQUISITION_ERAS'])
    dbs_dataset_access_type = spark.read.format('avro').load(
        hdfs_paths['DATASET_ACCESS_TYPES'])

    dbs_datasets_enr = dbs_datasets \
        .join(dbs_data_tiers, ['data_tier_id'], how='left') \
        .join(dbs_physics_group, ['physics_group_id'], how='left') \
        .join(dbs_acquisition_era, ['acquisition_era_id'], how='left') \
        .join(dbs_dataset_access_type, ['dataset_access_type_id'], how='left') \
        .select(['dataset', 'dataset_id', 'is_dataset_valid', 'primary_ds_id', 'processed_ds_id', 'prep_id',
                 'data_tier_id', 'data_tier_name',
                 'physics_group_id', 'physics_group_name',
                 'acquisition_era_id', 'acquisition_era_name',
                 'dataset_access_type_id', 'dataset_access_type'])

    # -----------------------------------------------------------------------------------------------------------------
    #                       -- ============  Main: join all  ============ --

    cond_with_only_rucio = [
        df_joint_main.dataset == df_only_from_rucio.contents_dataset,
        df_joint_main.rse_id == df_only_from_rucio.replica_rse_id
    ]

    cond_with_only_dbs = [
        df_joint_main.dataset == df_only_from_dbs.dbs_dataset,
        df_joint_main.rse_id == df_only_from_dbs.replica_rse_id
    ]

    # Left joins: since df_joint_main is built with a full outer join, it should contain all datasets of both Rucio and DBS
    df_main = df_joint_main.join(df_only_from_rucio,
                                 cond_with_only_rucio,
                                 how='left').drop('replica_rse_id')
    df_main = df_main.join(df_only_from_dbs, cond_with_only_dbs,
                           how='left').drop('replica_rse_id')

    df_main = df_main \
        .withColumn('rucio_has_ds_name',
                    when(col('is_d_name_from_rucio').isNotNull(), col('is_d_name_from_rucio'))
                    .otherwise(BOOL_STR[False])) \
        .withColumn('dbs_has_ds_name',
                    when(col('is_d_name_from_dbs').isNotNull(), col('is_d_name_from_dbs'))
                    .otherwise(BOOL_STR[False]))

    # Remove unneeded columns by selecting specific ones
    df_main = df_main.select([
        'dataset', 'rse_id', 'joint_size', 'joint_last_accessed_at',
        'joint_last_created_at', 'joint_dbs_n_files', 'joint_rucio_n_files',
        'joint_n_files', 'joint_n_accessed_files', 'all_f_in_dbs',
        'all_f_in_rucio', 'rucio_size', 'rucio_n_files',
        'rucio_n_accessed_files', 'rucio_has_ds_name', 'dbs_size',
        'dbs_n_files', 'dbs_n_accessed_files', 'dbs_has_ds_name',
        'rucio_locked_files', 'rucio_is_d_locked', 'dbs_locked_files',
        'dbs_is_d_locked', 'joint_locked_files', 'joint_is_d_locked'
    ])

    # Add DBS dataset enrichments to the main df
    df_main = df_main.join(dbs_datasets_enr, ['dataset'], how='left')

    # Add RSES name, type, tier, country, kind to dataset
    df_main = df_main \
        .join(df_rses, df_main.rse_id == df_rses.replica_rse_id, how='left') \
        .drop('rse_id', 'replica_rse_id')

    # UTC timestamp of start hour of the spark job
    df_main = df_main.withColumn('tstamp_hour', lit(ts_current_hour))

    # Fill null values of string type columns. Null values are hard to handle in ES queries.
    df_main = df_main.fillna(value=NULL_STR_TYPE_COLUMN_VALUE,
                             subset=STR_TYPE_COLUMNS)
    return df_main
Example No. 7
def main(
    output_folder="./www/stepchain",
    start_date=None,
    end_date=None,
    last_n_days=15,
):
    """Get step data in wmarchive.

    Each 'steps' array contains multiple steps. The udf function returns each step as a separate element of a list,
    and flatMap flattens those lists so that each step becomes an individual row in the dataframe.
    """
    # Borrowed logic from condor_cpu_efficiency
    _yesterday = datetime.combine(date.today() - timedelta(days=1),
                                  datetime.min.time())
    if not (start_date or end_date):
        end_date = _yesterday
        start_date = end_date - timedelta(days=last_n_days)
    elif not start_date:
        start_date = end_date - timedelta(days=last_n_days)
    elif not end_date:
        end_date = min(start_date + timedelta(days=last_n_days), _yesterday)
    if start_date > end_date:
        raise ValueError(
            f"start date ({start_date}) should be earlier than end date({end_date})"
        )

    spark = get_spark_session()
    df_raw = spark.read.option("basePath", _DEFAULT_HDFS_FOLDER).json(
        get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)
    ) \
        .select(["data.*", "metadata.timestamp"]) \
        .filter(
        f"""data.meta_data.jobstate='success'
                  AND data.meta_data.jobtype='Production'
                  AND data.wmats >= {start_date.timestamp()}
                  AND data.wmats < {end_date.timestamp()}
                  """
    )
    df_rdd = df_raw.rdd.flatMap(lambda r: udf_step_extract(r))
    df = spark.createDataFrame(df_rdd,
                               schema=get_schema()).dropDuplicates().where(
                                   _col("ncores").isNotNull()).cache()
    df_details = df.groupby(["task", "site", "step_name"]).agg(
        (100 * (_sum("jobCPU") / _mean("nthreads")) /
         _sum("jobTime")).alias("avg_cpueff"),
        _count(lit(1)).alias("#jobs"),
        _mean("steps_len").alias("#steps"),
        _mean("nthreads").alias("#nthreads"),
        _mean("ncores").alias("#ncores"),
        (_sum("jobCPU") / _count(lit(1))).alias("avg_jobCPU"),
        (_sum("jobTime") / _count(lit(1))).alias("avg_jobTime"),
        _collect_set("acquisitionEra").alias("acquisitionEra"),
    ).withColumn("avg_cpueff",
                 _col("avg_cpueff").cast(IntegerType())).toPandas()
    df_task = df.groupby(["task"]).agg(
        (100 * (_sum("jobCPU") / _mean("nthreads")) /
         _sum("jobTime")).alias("avg_cpueff"),
        _count(lit(1)).alias("#jobs"),
        _mean("steps_len").alias("#steps"),
        _mean("nthreads").alias("#nthreads"),
        _mean("ncores").alias("#ncores"),
        (_sum("jobCPU") / _count(lit(1))).alias("avg_jobCPU"),
        (_sum("jobTime") / _count(lit(1))).alias("avg_jobTime"),
    ).withColumn("avg_cpueff",
                 _col("avg_cpueff").cast(IntegerType())).toPandas()
    write_htmls(df_details, df_task, start_date, end_date, output_folder)