StructField("year", StringType(), True), StructField("month", StringType(), True) ]) raw_crime_df = spark.readStream \ .option("header", "false") \ .option("maxFilesPerTrigger", 2) \ .schema(schema) \ .csv(data_path) raw_crime_df.createOrReplaceTempView("CrimeData") print("Is the stream ready?", raw_crime_df.isStreaming) category_df = spark.sql( "SELECT major_category, value FROM CrimeData WHERE year = '2016'") crime_per_cat_df = category_df.groupBy("major_category")\ .agg(_sum("value").alias("convictions"))\ .orderBy(desc("convictions")) query = crime_per_cat_df.writeStream\ .outputMode("complete")\ .format("console")\ .option("truncate", "false")\ .option("numRows", 30)\ .start()\ .awaitTermination() # spark-submit --packages "org.apache.hadoop:hadoop-aws:2.7.4" com/dsm/files/sql_demo.py
def main():
    """
    TODO: Create html page

    Access time filter logic:
        - If "last_access_ts" is older than 3 months, then set "months_old" as 3,
        - If "last_access_ts" is older than 6 months, then set "months_old" as 6,
        - If "last_access_ts" is older than 12 months, then set "months_old" as 12
    The result includes only the datasets whose last access time is more than 12, 6 or 3 months ago.
    """
    spark = get_spark_session()
    (df_contents_f_to_b, df_contents_b_to_d, df_replicas, df_dids_files,
     df_replicas_j_dids, df_files_complete) = prepare_spark_dataframes(spark)

    # ===============================================================================
    # Continue with joins
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    # -------------------------------------------------------------------------------
    # --- STEP-10 / Tests to check dataframes are okay ---:
    #   df_block_file_rse.select("file").distinct().count() = 29921156
    #   df_block_file_rse.filter(col("file").isNull()).count() = 0
    #   df_block_file_rse.filter(col("block").isNull()).count() = 57892
    #   The above means we cannot extract the block names of 57892 files from the CONTENTS table ..
    #   .. which provides the F:D and D:C mapping (file, dataset, container in Rucio terms)
    #   df_block_file_rse.filter(col("rse_id").isNull()).count() = 0
    #   df_block_file_rse.filter(col("fsize").isNull()).count() = 0
    #   We are all good, just drop null block names.

    # STEP-10: Left join df_files_complete and df_contents_f_to_b to get block names of files.
    #          - There are some files whose block names cannot be extracted from the CONTENTS table,
    #            so filter them out.
    df_block_file_rse = df_files_complete \
        .join(df_contents_f_to_b, ["file"], how="left") \
        .select(['block', 'file', 'rse_id', 'accessed_at', 'fsize', ]) \
        .filter(col("block").isNotNull()) \
        .cache()

    # --- STEP-11 / Tests to check dataframes are okay ---:
    #   df_all.filter(col("dataset").isNull()).count() = 280821

    # STEP-11: Left join df_block_file_rse and df_contents_b_to_d to get dataset names of blocks & files.
    #          - There are some blocks whose dataset names cannot be extracted from the CONTENTS table,
    #            so filter them out.
    df_all = df_block_file_rse \
        .join(df_contents_b_to_d, ["block"], how="left") \
        .select(['dataset', 'block', 'file', 'rse_id', 'accessed_at', 'fsize']) \
        .filter(col("dataset").isNotNull()) \
        .cache()

    # STEP-12: Group by "dataset" and "rse_id" to calculate:
    #   - dataset_size_in_rse: total size of a dataset in an RSE, by summing up all of the dataset's files in that RSE.
    #   - `last_access_time_of_dataset_per_rse`: last access time of a dataset in an RSE,
    #     obtained as the max of the file `accessed_at` field over the dataset's files in that RSE.
    #   - `#files_null_access_time_per_rse`: number of files with a NULL `accessed_at` field in each dataset in an RSE.
    #     This is important to know in order to filter out datasets that have any file with a NULL accessed_at.
    #   - `#files_per_rse`: number of files of the dataset in that RSE
    #   - `#files_unique_per_rse`: unique count of the dataset's files in that RSE
    #   Final result: one dataset can be in multiple RSEs and it may have different sizes in each,
    #   since a dataset may have lost one of its blocks or files in an RSE.
    df_final_dataset_rse = df_all \
        .groupby(["dataset", "rse_id"]) \
        .agg(_sum(col("fsize")).alias("dataset_size_in_rse"),
             _max(col("accessed_at")).alias("last_access_time_of_dataset_per_rse"),
             _sum(when(col("accessed_at").isNull(), 1).otherwise(0)).alias("#files_null_access_time_per_rse"),
             _count(lit(1)).alias("#files_per_rse"),
             countDistinct(col("file")).alias("#files_unique_per_rse"),
             ) \
        .cache()

    # STEP-13: Get thresholds. They are unix timestamps which are 3, 6 and 12 months ago from today.
    ts_thresholds = get_ts_thresholds()

    # STEP-14:
    # Filter for calculating the last_access_more_than_{12|6|3}_months_ago columns.
    #   - To produce correct results, the "last_access_time_of_dataset_per_rse" field should not be null,
    #     which means all of a dataset's files have their accessed_at fields filled.
    #   - And "#files_null_access_time_per_rse" == 0 means there should not be
    #     any file with a NULL "accessed_at" field.
    # Group by dataset to get the final result over all RSEs' datasets.
    #   - max_dataset_size(TB): max size of the dataset over all RSEs that contain it
    #   - min_dataset_size(TB): min size of the dataset over all RSEs that contain it
    #   - avg_dataset_size(TB): avg size of the dataset over all RSEs that contain it
    #   - last_access_time_of_dataset: last access time of the dataset over all RSEs
    df = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNotNull() &
                (col("#files_null_access_time_per_rse") == 0)
                ) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .withColumn('last_access_more_than_12_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[12], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_6_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[6], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_3_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[3], 1).otherwise(0)
                    ) \
        .filter((col('last_access_more_than_12_months_ago') == 1) |
                (col('last_access_more_than_6_months_ago') == 1) |
                (col('last_access_more_than_3_months_ago') == 1)
                ) \
        .cache()

    # STEP-15: Find datasets whose files all have null accessed_at fields
    df_all_null_accessed_at = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNull()) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .cache()

    # Totals for the non-null data: "not read for more than 3, 6 or 12 months", which equals the "more than 3 months" values.
df.select([ "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)" ]).groupBy().sum().show() # For 12 months df.filter(col("last_access_more_than_12_months_ago") == 1).select([ "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)" ]).groupBy().sum().show() print(df.filter(col("last_access_more_than_12_months_ago") == 1).count()) # For 6 months df.filter(col("last_access_more_than_6_months_ago") == 1).select([ "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)" ]).groupBy().sum().show() print(df.filter(col("last_access_more_than_6_months_ago") == 1).count()) # For 3 months df.filter(col("last_access_more_than_3_months_ago") == 1).select([ "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)" ]).groupBy().sum().show() print(df.filter(col("last_access_more_than_3_months_ago") == 1).count()) # For all null accessed_at(all files) datasets df_all_null_accessed_at.select([ "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)" ]).groupBy().sum().show() print(df_all_null_accessed_at.count()) return df, df_all_null_accessed_at
def create_main_df(spark, hdfs_paths, base_eos_dir): # UTC timestamp of start hour of spark job ts_current_hour = int(datetime.utcnow().replace( minute=0, second=0, microsecond=0, tzinfo=timezone.utc).timestamp() * 1000) # ----------------------------------------------------------------------------------------------------------------- # -- ================== Prepare main Spark dataframes =========================== # Get RSES id, name, type, tier, country, kind from RSES table dump df_rses = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['RSES']) \ .filter(col('DELETED_AT').isNull()) \ .withColumn('replica_rse_id', lower(_hex(col('ID')))) \ .withColumnRenamed('RSE', 'rse') \ .withColumnRenamed('RSE_TYPE', 'rse_type') \ .withColumn('rse_tier', _split(col('rse'), '_').getItem(0)) \ .withColumn('rse_country', _split(col('rse'), '_').getItem(1)) \ .withColumn('rse_kind', when(col("rse").endswith('Temp'), 'temp') .when(col("rse").endswith('Test'), 'test') .otherwise('prod') ) \ .select(['replica_rse_id', 'rse', 'rse_type', 'rse_tier', 'rse_country', 'rse_kind']) # Rucio Dataset(D) refers to dbs block, so we used DBS terminology from the beginning df_contents_f_to_b = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['CONTENTS']) \ .filter(col("SCOPE") == "cms") \ .filter(col("DID_TYPE") == "D") \ .filter(col("CHILD_TYPE") == "F") \ .withColumnRenamed("NAME", "block") \ .withColumnRenamed("CHILD_NAME", "file") \ .select(["block", "file"]) # Rucio Dataset(D) refers to dbs block; Rucio Container(C) refers to dbs dataset. # We used DBS terminology from the beginning df_contents_b_to_d = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['CONTENTS']) \ .filter(col("SCOPE") == "cms") \ .filter(col("DID_TYPE") == "C") \ .filter(col("CHILD_TYPE") == "D") \ .withColumnRenamed("NAME", "dataset") \ .withColumnRenamed("CHILD_NAME", "block") \ .select(["dataset", "block"]) # Get file to dataset map df_contents_ds_files = df_contents_f_to_b.join(df_contents_b_to_d, ["block"], how="left") \ .filter(col('file').isNotNull()) \ .filter(col('dataset').isNotNull()) \ .withColumnRenamed('dataset', 'contents_dataset') \ .withColumn('is_d_name_from_rucio', lit(BOOL_STR[True])) \ .select(["contents_dataset", "file", "is_d_name_from_rucio"]) dbs_files = spark.read.format('avro').load(hdfs_paths['FILES']) \ .withColumnRenamed('LOGICAL_FILE_NAME', 'file') \ .withColumnRenamed('DATASET_ID', 'dbs_file_ds_id') \ .withColumnRenamed('FILE_SIZE', 'dbs_file_size') \ .select(['file', 'dbs_file_ds_id', 'dbs_file_size']) dbs_datasets = spark.read.format('avro').load(hdfs_paths['DATASETS']) df_dbs_ds_files = dbs_files.join(dbs_datasets.select(['DATASET_ID', 'DATASET']), dbs_files.dbs_file_ds_id == dbs_datasets.DATASET_ID, how='left') \ .filter(col('file').isNotNull()) \ .filter(col('DATASET').isNotNull()) \ .withColumnRenamed('dbs_file_ds_id', 'dbs_dataset_id') \ .withColumnRenamed('DATASET', 'dbs_dataset') \ .withColumn('is_d_name_from_dbs', lit(BOOL_STR[True])) \ .select(['file', 'dbs_dataset', 'is_d_name_from_dbs']) # Prepare replicas df_replicas = spark.read.format('avro').load(hdfs_paths['REPLICAS']) \ .filter(col("SCOPE") == "cms") \ .withColumn('replica_rse_id', lower(_hex(col('RSE_ID')))) \ .withColumn('replica_file_size', col('BYTES').cast(LongType())) \ .withColumnRenamed('NAME', 'file') \ .withColumnRenamed('ACCESSED_AT', 'replica_accessed_at') \ .withColumnRenamed('CREATED_AT', 'replica_created_at') \ .withColumnRenamed('LOCK_CNT', 'lock_cnt') \ .withColumnRenamed('STATE', 
'state') \ .select(['file', 'replica_rse_id', 'replica_file_size', 'replica_accessed_at', 'replica_created_at', 'lock_cnt']) # Create enriched file df which adds dbs file size to replicas files. Left join select only replicas files df_files_enriched_with_dbs = df_replicas \ .join(dbs_files.select(['file', 'dbs_file_size']), ['file'], how='left') \ .withColumn('joint_file_size', when(col('replica_file_size').isNotNull(), col('replica_file_size')) .when(col('dbs_file_size').isNotNull(), col('dbs_file_size')) ) \ .select(['file', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'lock_cnt', 'replica_file_size', 'dbs_file_size', 'joint_file_size']) # ----------------------------------------------------------------------------------------------------------------- # -- ================== only Rucio: Replicas and Contents ======================= -- df_only_from_rucio = df_replicas \ .join(df_contents_ds_files, ['file'], how='left') \ .select(['contents_dataset', 'file', 'replica_rse_id', 'replica_file_size', 'replica_accessed_at', 'replica_created_at', 'is_d_name_from_rucio', 'lock_cnt']) # Use them in outer join # _max(col('replica_accessed_at')).alias('rucio_last_accessed_at'), # _max(col('replica_created_at')).alias('rucio_last_created_at'), df_only_from_rucio = df_only_from_rucio \ .groupby(['replica_rse_id', 'contents_dataset']) \ .agg(_sum(col('replica_file_size')).alias('rucio_size'), _count(lit(1)).alias('rucio_n_files'), _sum( when(col('replica_accessed_at').isNull(), 0) .otherwise(1) ).alias('rucio_n_accessed_files'), _first(col("is_d_name_from_rucio")).alias("is_d_name_from_rucio"), _sum(col('lock_cnt')).alias('rucio_locked_files') ) \ .withColumn('rucio_is_d_locked', when(col('rucio_locked_files') > 0, IS_DATASET_LOCKED[True]) .otherwise(IS_DATASET_LOCKED[False]) ) \ .select(['contents_dataset', 'replica_rse_id', 'rucio_size', 'rucio_n_files', 'rucio_n_accessed_files', 'is_d_name_from_rucio', 'rucio_locked_files', 'rucio_is_d_locked', ]) # ----------------------------------------------------------------------------------------------------------------- # -- ================= only DBS: Replicas, Files, Datasets ====================== -- # Of course only files from Replicas processed, select only dbs related fields df_only_from_dbs = df_files_enriched_with_dbs \ .select(['file', 'replica_rse_id', 'dbs_file_size', 'replica_accessed_at', 'lock_cnt']) \ .join(df_dbs_ds_files, ['file'], how='left') \ .filter(col('dbs_dataset').isNotNull()) \ .select(['file', 'dbs_dataset', 'replica_rse_id', 'dbs_file_size', 'replica_accessed_at', 'is_d_name_from_dbs', 'lock_cnt']) df_only_from_dbs = df_only_from_dbs \ .groupby(['replica_rse_id', 'dbs_dataset']) \ .agg(_sum(col('dbs_file_size')).alias('dbs_size'), _count(lit(1)).alias('dbs_n_files'), _sum( when(col('replica_accessed_at').isNull(), 0) .otherwise(1) ).alias('dbs_n_accessed_files'), _first(col("is_d_name_from_dbs")).alias("is_d_name_from_dbs"), _sum(col('lock_cnt')).alias('dbs_locked_files') ) \ .withColumn('dbs_is_d_locked', when(col('dbs_locked_files') > 0, IS_DATASET_LOCKED[True]) .otherwise(IS_DATASET_LOCKED[False]) ) \ .select(['dbs_dataset', 'replica_rse_id', 'dbs_size', 'dbs_n_files', 'dbs_n_accessed_files', 'is_d_name_from_dbs', 'dbs_locked_files', 'dbs_is_d_locked']) # Full outer join of Rucio and DBS to get all dataset-file maps df_dataset_file_map_enr = df_contents_ds_files.join(df_dbs_ds_files, ['file'], how='full') # 
----------------------------------------------------------------------------------------------------------------- # -- ====== check files do not have dataset name ============ -- # Check Replicas files do not have dataset name in Contents, DBS or both x = df_replicas.join(df_dataset_file_map_enr, ['file'], how='left') \ .select(['contents_dataset', 'dbs_dataset', 'file']) y_contents = x.filter(col('contents_dataset').isNull()) z_dbs = x.filter(col('dbs_dataset').isNull()) t_both = x.filter( col('contents_dataset').isNull() & col('dbs_dataset').isNull()) stats_dict = { "Replicas files do not have dataset name in Contents": y_contents.select('file').distinct().count(), "Replicas files do not have dataset name in DBS": z_dbs.select('file').distinct().count(), "Replicas files do not have dataset name neither in Contents nor DBS": t_both.select('file').distinct().count() } write_stats_to_eos(base_eos_dir, stats_dict) del x, y_contents, z_dbs, t_both # ----------------------------------------------------------------------------------------------------------------- # -- ====== joint Rucio and DBS: Replicas, Contents, Files, Datasets ============ -- # Main aim is to get all datasets of files df_dataset_file_map_enr = df_dataset_file_map_enr \ .withColumn("dataset", when(col("contents_dataset").isNotNull(), col("contents_dataset")) .when(col("dbs_dataset").isNotNull(), col("dbs_dataset")) ) \ .withColumn("is_ds_from_rucio", when(col("is_d_name_from_rucio").isNotNull(), 1).otherwise(0)) \ .withColumn("is_ds_from_dbs", when(col("is_d_name_from_dbs").isNotNull(), 1).otherwise(0)) \ .select(['dataset', 'file', 'is_ds_from_dbs', 'is_ds_from_rucio']) df_joint_ds_files = df_files_enriched_with_dbs \ .select(['file', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'joint_file_size', 'lock_cnt']) \ .join(df_dataset_file_map_enr, ['file'], how='left') \ .filter(col('dataset').isNotNull()) \ .select(['dataset', 'file', 'is_ds_from_dbs', 'is_ds_from_rucio', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'joint_file_size', 'lock_cnt']) df_joint_main = df_joint_ds_files \ .groupby(['replica_rse_id', 'dataset']) \ .agg(_sum(col('joint_file_size')).alias('joint_size'), _max(col('replica_accessed_at')).alias('joint_last_accessed_at'), _max(col('replica_created_at')).alias('joint_last_created_at'), _sum(col('is_ds_from_dbs')).alias('joint_dbs_n_files'), _sum(col('is_ds_from_rucio')).alias('joint_rucio_n_files'), _count(lit(1)).alias('joint_n_files'), _sum( when(col('replica_accessed_at').isNull(), 0).otherwise(1) ).alias('joint_n_accessed_files'), _sum(col('lock_cnt')).alias('joint_locked_files') ) \ .withColumn('all_f_in_dbs', when((col('joint_dbs_n_files') == 0) & (col('joint_dbs_n_files').isNull()), IS_ALL_DATASET_FILES_EXISTS['n']) .when(col('joint_dbs_n_files') == col('joint_n_files'), IS_ALL_DATASET_FILES_EXISTS['a']) .when(col('joint_dbs_n_files') > 0, IS_ALL_DATASET_FILES_EXISTS['p']) ) \ .withColumn('all_f_in_rucio', when((col('joint_rucio_n_files') == 0) & (col('joint_rucio_n_files').isNull()), IS_ALL_DATASET_FILES_EXISTS['n']) .when(col('joint_rucio_n_files') == col('joint_n_files'), IS_ALL_DATASET_FILES_EXISTS['a']) .when(col('joint_rucio_n_files') > 0, IS_ALL_DATASET_FILES_EXISTS['p']) ) \ .withColumn('joint_is_d_locked', when(col('joint_locked_files') > 0, IS_DATASET_LOCKED[True]) .otherwise(IS_DATASET_LOCKED[False]) ) \ .withColumnRenamed("replica_rse_id", "rse_id") \ .select(['dataset', 'rse_id', 'joint_size', 'joint_last_accessed_at', 'joint_last_created_at', 
'joint_dbs_n_files', 'joint_rucio_n_files', 'joint_n_files', 'joint_n_accessed_files', 'all_f_in_dbs', 'all_f_in_rucio', 'joint_locked_files', 'joint_is_d_locked' ]) # ----------------------------------------------------------------------------------------------------------------- # -- ============ Dataset enrichment with Dataset tags ============ -- # Enrich dbs dataset with names from id properties of other tables dbs_data_tiers = spark.read.format('avro').load(hdfs_paths['DATA_TIERS']) dbs_physics_group = spark.read.format('avro').load( hdfs_paths['PHYSICS_GROUPS']) dbs_acquisition_era = spark.read.format('avro').load( hdfs_paths['ACQUISITION_ERAS']) dbs_dataset_access_type = spark.read.format('avro').load( hdfs_paths['DATASET_ACCESS_TYPES']) dbs_datasets_enr = dbs_datasets \ .join(dbs_data_tiers, ['data_tier_id'], how='left') \ .join(dbs_physics_group, ['physics_group_id'], how='left') \ .join(dbs_acquisition_era, ['acquisition_era_id'], how='left') \ .join(dbs_dataset_access_type, ['dataset_access_type_id'], how='left') \ .select(['dataset', 'dataset_id', 'is_dataset_valid', 'primary_ds_id', 'processed_ds_id', 'prep_id', 'data_tier_id', 'data_tier_name', 'physics_group_id', 'physics_group_name', 'acquisition_era_id', 'acquisition_era_name', 'dataset_access_type_id', 'dataset_access_type']) # ----------------------------------------------------------------------------------------------------------------- # -- ============ Main: join all ============ -- cond_with_only_rucio = [ df_joint_main.dataset == df_only_from_rucio.contents_dataset, df_joint_main.rse_id == df_only_from_rucio.replica_rse_id ] cond_with_only_dbs = [ df_joint_main.dataset == df_only_from_dbs.dbs_dataset, df_joint_main.rse_id == df_only_from_dbs.replica_rse_id ] # Left joins: since df_join_main has outer join, should have all datasets of both Rucio and DBS df_main = df_joint_main.join(df_only_from_rucio, cond_with_only_rucio, how='left').drop('replica_rse_id') df_main = df_main.join(df_only_from_dbs, cond_with_only_dbs, how='left').drop('replica_rse_id') df_main = df_main \ .withColumn('rucio_has_ds_name', when(col('is_d_name_from_rucio').isNotNull(), col('is_d_name_from_rucio')) .otherwise(BOOL_STR[False])) \ .withColumn('dbs_has_ds_name', when(col('is_d_name_from_dbs').isNotNull(), col('is_d_name_from_dbs')) .otherwise(BOOL_STR[False])) # Remove unneeded columns by selecting specific ones df_main = df_main.select([ 'dataset', 'rse_id', 'joint_size', 'joint_last_accessed_at', 'joint_last_created_at', 'joint_dbs_n_files', 'joint_rucio_n_files', 'joint_n_files', 'joint_n_accessed_files', 'all_f_in_dbs', 'all_f_in_rucio', 'rucio_size', 'rucio_n_files', 'rucio_n_accessed_files', 'rucio_has_ds_name', 'dbs_size', 'dbs_n_files', 'dbs_n_accessed_files', 'dbs_has_ds_name', 'rucio_locked_files', 'rucio_is_d_locked', 'dbs_locked_files', 'dbs_is_d_locked', 'joint_locked_files', 'joint_is_d_locked' ]) # Add DBS dataset enrichment's to main df df_main = df_main.join(dbs_datasets_enr, ['dataset'], how='left') # Add RSES name, type, tier, country, kind to dataset df_main = df_main \ .join(df_rses, df_main.rse_id == df_rses.replica_rse_id, how='left') \ .drop('rse_id', 'replica_rse_id') # UTC timestamp of start hour of the spark job df_main = df_main.withColumn('tstamp_hour', lit(ts_current_hour)) # Fill null values of string type columns. Null values is hard to handle in ES queries. df_main = df_main.fillna(value=NULL_STR_TYPE_COLUMN_VALUE, subset=STR_TYPE_COLUMNS) return df_main
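# create_main_df() above expects an `hdfs_paths` dict whose keys are referenced throughout the
# function. A hedged usage sketch with placeholder locations (the real dump paths are not shown here):
hdfs_paths = {
    'RSES': '/hdfs/path/to/rucio/RSES/',  # placeholder paths, assumptions only
    'CONTENTS': '/hdfs/path/to/rucio/CONTENTS/',
    'REPLICAS': '/hdfs/path/to/rucio/REPLICAS/',
    'FILES': '/hdfs/path/to/dbs/FILES/',
    'DATASETS': '/hdfs/path/to/dbs/DATASETS/',
    'DATA_TIERS': '/hdfs/path/to/dbs/DATA_TIERS/',
    'PHYSICS_GROUPS': '/hdfs/path/to/dbs/PHYSICS_GROUPS/',
    'ACQUISITION_ERAS': '/hdfs/path/to/dbs/ACQUISITION_ERAS/',
    'DATASET_ACCESS_TYPES': '/hdfs/path/to/dbs/DATASET_ACCESS_TYPES/',
}
# df_main = create_main_df(spark, hdfs_paths, base_eos_dir)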
# print(df.count())

# Spark india trade
keys = []
values = []
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
df = spark.read.csv(path="DataSources/india-trade-data/2018-2010_import.csv", header="true")
df = df.select(df.country, df.value.cast('float').alias('value')).where(
    df.value.isNotNull())
df = df.groupBy("country").agg(_sum("value").alias("sum_val"))
df = df.select(df.country, df.sum_val.cast('int').alias('total')).orderBy('total', ascending=False)
rows = df.limit(10).collect()
for r in rows:
    keys.append(r[0])
    values.append(r[1])

explode = []
for i in range(len(values)):
    explode.append(0)
explode[0] = 0.1  # only "explode" the largest slice

fig1, ax1 = plt.subplots()
ax1.pie(values, explode=explode, labels=keys,
combined_df = lines.withColumn("tx_id", splittedClms.getItem(0).cast("integer")) \
    .withColumn("product_id", splittedClms.getItem(1).cast("integer")) \
    .withColumn("qty", splittedClms.getItem(2).cast("integer")) \
    .withColumn("amt", splittedClms.getItem(3).cast("integer")) \
    .withColumn("day_dt", splittedClms.getItem(4).cast("string"))

streamingDf = combined_df.select("tx_id", "product_id", "qty", "amt", "day_dt", "timestamp")
joinedDf = streamingDf.join(productDf, "product_id").select(
    "tx_id", "product_id", "name", "qty", "amt", "timestamp")  # inner equi-join with a static DF

aggDf = joinedDf.groupBy(window(joinedDf.timestamp, "2 minutes", "1 minutes"), joinedDf.product_id,
                         joinedDf.name).agg(_sum("qty"), _sum("amt")).sort(joinedDf.product_id)
# TODO: .sort(joinedDf.columns[1])
# What are "2 minutes" and "1 minutes" here? A: the window duration and the slide interval (the window advances every 1 minute).

query = aggDf \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false").start()
# 'complete' means the full result table (all rows since the start of the stream) is emitted on every trigger, even rows that have not changed.

query.awaitTermination()
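# A small, self-contained sketch (an assumed example, not part of the original job) illustrating
# the window("2 minutes", "1 minutes") semantics used above: because the window slides every
# 1 minute, each event falls into two overlapping 2-minute windows.
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, sum as _sum

spark = SparkSession.builder.appName("window-demo").getOrCreate()
demo_df = spark.createDataFrame(
    [("2024-01-01 10:00:30", 5), ("2024-01-01 10:01:10", 3)],
    ["timestamp", "qty"],
).selectExpr("cast(timestamp as timestamp) as timestamp", "qty")

demo_df.groupBy(window("timestamp", "2 minutes", "1 minutes")).agg(_sum("qty")).show(truncate=False)
# The 10:00:30 event contributes to both the [09:59, 10:01) and [10:00, 10:02) windows.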
# calculate min and max of order date in order to calculate recency
max_order_date, min_order_date = init_flat_data \
    .select(
        _max(col('order_date')),
        _min(col('order_date'))) \
    .take(1)[0]

# calculate recency, frequency and monetary value
calculate_diff_day = udf(lambda x: (max_order_date - x).days, IntegerType())
rfm_table = init_flat_data \
    .withColumn('recency', calculate_diff_day('order_date')) \
    .groupby(['company_id', 'company_name', 'country']) \
    .agg(
        _mean(col('recency')).alias('recency'),
        _count(col('order_id')).alias('frequency'),
        _sum(col('NBI')).alias('monetary')
    )

# calculate quantiles for each variable
quantiles = rfm_table.approxQuantile(['recency', 'frequency', 'monetary'],
                                     [0.20, 0.4, 0.6, 0.8], 0)
r_quantile = quantiles[0]
f_quantile = quantiles[1]
m_quantile = quantiles[2]

# calculate the score of each variable
def_r_score = udf(
    lambda x: 5 if x < r_quantile[0] else 4 if x < r_quantile[1] else 3
    if x < r_quantile[2] else 2 if x < r_quantile[3] else 1, IntegerType())
def_f_score = udf(
    lambda x: 1 if x < f_quantile[0] else 2 if x < f_quantile[1] else 3
engine = create_engine(get_engine_uri("mysql", "pymysql"), pool_size=50) save_videos_table(videos_df.rdd.collect(), engine) # Process trending videos #TODO: Replace with an env variable if len(df.take(10)) < 10: # insufficient videos print("Too few videos, exiting...") exit(0) trending_df = sql_reader.options(**sql_config).option("dbtable", "trending").load() trending_df = trending_df \ .sort("timestamp", ascending=False) \ .limit(10) \ .groupBy("video_id") \ .agg(_sum("views").alias("c_views")) ts = current_timestamp() trending_df = df.join(trending_df, on='video_id', how='left_outer').na.fill(0, "c_views") trending_df = trending_df \ .withColumn("sum", col("views") + col("c_views")) \ .drop("views") \ .drop("c_views") \ .drop("id") \ .withColumnRenamed("sum", "views") \ .withColumn("timestamp", ts) \ .sort("views", ascending=False) \ .limit(10) trending_df.write \ .mode("append") \
# Process data based on max date (where dates greater than the record date are missing)
df4_max = df3.filter(col("_diff_max") > 0)
df5_max = df4_max.withColumn("_next_dates", get_next_dates_udf(date_column, "_diff_max"))

# Process data based on min date (where dates less than the record date are missing)
df4_min = df3.filter(col("_diff_min") < 0)
df5_min = df4_min.withColumn("_next_dates", get_prev_dates_udf(date_column, "_diff_min"))

# Combine dataframes for all missing data
df5 = df5_max.union(df5_min)
# Add dummy value
df6 = df5.withColumn(fill_column, lit(0))
# Explode dates
df7 = df6.withColumn(date_column, explode("_next_dates"))
# Drop columns that were added for processing
df8 = df7.drop("max_dt", "min_dt", "_diff_max", "_diff_min", "_next_dates")
# Drop duplicates
df9 = df8.dropDuplicates()
# Combine with base dataframe and drop duplicates again after the union
df10 = df9.union(df1)
df10 = df10.dropDuplicates()
# Aggregate to get rid of rows from the exploded data that are already present in the base dataframe
df11 = df10.groupBy("dt", "product", "channel").agg(_sum("quantity"))
df11.sort("dt").show(10)
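# get_next_dates_udf / get_prev_dates_udf are referenced above but not defined in this excerpt.
# A hedged sketch of what such UDFs might look like: given a record's date and its gap (in days)
# to the max/min date, return the list of missing dates that will then be exploded into rows.
from datetime import timedelta
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DateType

@udf(returnType=ArrayType(DateType()))
def get_next_dates_udf(start_date, diff_max):
    # dates after the record date, up to the max date
    return [start_date + timedelta(days=i) for i in range(1, int(diff_max) + 1)]

@udf(returnType=ArrayType(DateType()))
def get_prev_dates_udf(start_date, diff_min):
    # dates before the record date, down to the min date (diff_min is negative)
    return [start_date - timedelta(days=i) for i in range(1, abs(int(diff_min)) + 1)]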
plt.title("Daily cases") plt.xlabel("Date") plt.ylabel("Count") plt.rcParams["figure.figsize"] = (30, 5) #Show cumulative confirmed cases by day cases_group_by_date.toPandas().plot(x="date", y="cumulativeConfirmed") plt.title("Daily Cumulative Confirmed Cases") plt.xlabel("Date") plt.ylabel("Count") plt.rcParams["figure.figsize"] = (30, 5) #Show dailyConfirmed cases sum group by dayOfWeek cases_group_by_day_of_week = cases_group_by_date\ .groupby(['dayOfWeek'])\ .agg(_sum('dailyConfirmed'))\ .orderBy('dayOfWeek',asending=True) cases_group_by_day_of_week.show() cases_group_by_day_of_week.toPandas().plot(kind='bar') plt.rcParams["figure.figsize"] = (5, 5) #Show cases group by categories as imported, local cases not residing dorm, local cases residing dorms case_categories_columns = ["Case Type", "Count"] case_categories = [ ("Imported Cases", cases_group_by_date.groupBy().sum('dailyImported').collect()[0][0]), ("Local Transmission Cases", cases_group_by_date.groupBy().sum( 'localCaseNotResidingInDorms').collect()[0][0]), ("Local Case Reside Dorms", cases_group_by_date.groupBy().sum( 'localCaseResidingInDorms').collect()[0][0])
def process_and_get_pd_dfs(spark, start_date, end_date): schema = _get_schema() raw_df = ( spark.read.option('basePath', DEFAULT_HDFS_FOLDER).json( get_candidate_files(start_date, end_date, spark, base=DEFAULT_HDFS_FOLDER), schema=schema, ).select('data.*').filter( col("RecordTime").between(f"{start_date.timestamp() * 1000}", f"{end_date.timestamp() * 1000}")). filter( (col('Site') == 'T3_US_ANL') | # ANL (col('Site') == 'T3_US_NERSC') | # NERSC (col('Site') == 'T3_US_OSG') | # OSG (col('Site') == 'T3_US_PSC') | # PSC (col('Site') == 'T3_US_SDSC') | # SDSC (col('Site') == 'T3_US_TACC') | # TACC ((col('Site').endswith('_ES_PIC_BSC')) & (col('MachineAttrCMSSubSiteName0') == 'PIC-BSC')) | # BSC ((col('Site') == 'T1_IT_CNAF') & (col('MachineAttrCMSSubSiteName0') == 'CNAF-CINECA')) | # CINECA ((col('Site') == 'T1_DE_KIT') & (col('MachineAttrCMSSubSiteName0') == 'KIT-HOREKA')) | # HOREKA ((col('Site') == 'T2_DE_RWTH') & (col('MachineAttrCMSSubSiteName0') == 'RWTH-HPC')) # RWTH ).filter(col('Status').isin([ 'Running', 'Completed' ])).withColumn('date', from_unixtime( (col('RecordTime') / 1000))).withColumn( 'site_name', when(col('Site') == 'T3_US_ANL', lit("ANL")).when( col('Site') == 'T3_US_NERSC', lit("NERSC")).when( col('Site') == 'T3_US_OSG', lit("OSG")).when( col('Site') == 'T3_US_PSC', lit("PSC")).when( col('Site') == 'T3_US_SDSC', lit("SDSC")).when( col('Site') == 'T3_US_TACC', lit("TACC")). when( col('Site').endswith('_ES_PIC_BSC'), lit("BSC")).when( col('MachineAttrCMSSubSiteName0') == 'CNAF-CINECA', lit("CINECA")).when( col('MachineAttrCMSSubSiteName0') == 'KIT-HOREKA', lit("HOREKA")). when( col('MachineAttrCMSSubSiteName0') == 'RWTH-HPC', lit("RWTH"))).withColumn( "RequestCpus", when( col("RequestCpus").isNotNull(), col("RequestCpus")).otherwise(lit(1)), ).withColumn('dayofmonth', _dayofmonth( col('date'))).withColumn( 'month', concat_ws( '-', _year(col('date')), format_string('%02d', _month( col('date')))) # 2-digit month, default 1 ).drop( 'Site', 'MachineAttrCMSSubSiteName0').withColumnRenamed( 'site_name', 'site')) # There should be only Completed status for a GlobalJobId df_core_hr = raw_df.filter(col('Status') == 'Completed') \ .drop_duplicates(["GlobalJobId"]) df_core_hr_daily = df_core_hr.groupby(['site', 'month', 'dayofmonth']) \ .agg(_round(_sum("CoreHr")).alias("sum CoreHr")) df_core_hr_monthly = df_core_hr.groupby(['site', 'month']) \ .agg(_round(_sum("CoreHr")).alias("sum CoreHr")) sec_12_min = 60 * 12 time_window_12m = from_unixtime( unix_timestamp('date') - unix_timestamp('date') % sec_12_min) # 1st group-by includes GlobaljobId to get running cores of GlobaljobId without duplicates in each 12 minutes window # 2nd group-by gets sum of RequestCpus in 12 minutes window # 3rd group-by gets avg of RequestCpus(12 minutes window) for each site for each month df_running_cores_daily = raw_df \ .withColumn('12m_window', time_window_12m) \ .groupby(['site', 'month', 'dayofmonth', '12m_window', 'GlobalJobId']) \ .agg(_max(col('RequestCpus')).alias('running_cores_of_single_job_in_12m')) \ .groupby(['site', 'month', 'dayofmonth', '12m_window']) \ .agg(_sum(col('running_cores_of_single_job_in_12m')).alias('running_cores_12m_sum')) \ .groupby(['site', 'month', 'dayofmonth']) \ .agg(_round(_avg(col('running_cores_12m_sum'))).alias('running_cores_avg_over_12m_sum')) return df_core_hr_daily.toPandas(), df_running_cores_daily.toPandas( ), df_core_hr_monthly.toPandas()
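# The time_window_12m expression above truncates each timestamp down to its 12-minute bucket.
# A small standalone sketch (assumed example, with a UTC session timezone) of the same
# unix-timestamp modulo trick:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_unixtime, unix_timestamp

spark = SparkSession.builder.appName("bucket-demo").getOrCreate()
sec_12_min = 60 * 12
demo = spark.createDataFrame([("2024-03-01 10:05:00",), ("2024-03-01 10:17:30",)], ["date"])
demo.withColumn(
    "12m_window",
    from_unixtime(unix_timestamp("date") - unix_timestamp("date") % sec_12_min),
).show(truncate=False)
# 10:05:00 falls into the 10:00:00 bucket, 10:17:30 into the 10:12:00 bucket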
purchaseDenseRank = dense_rank().over(windowSpec) purchaseRank = rank().over(windowSpec) dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\ .select( col("CustomerId"), col("date"), col("Quantity"), purchaseRank.alias("quantityRank"), purchaseDenseRank.alias("quantityDenseRank"), maxPurchaseQuantity.alias("maxPurchaseQuantity")).show(30) #rollup dfNoNull = dfWithDate.drop() dfNoNull.createOrReplaceTempView("dfNoNull") dfNoNull.show() rolledUpDF = dfNoNull.rollup("Date", "Country").agg(_sum("Quantity"))\ .selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity").orderBy("Date") rolledUpDF.show() rolledUpDF.where("Country IS NULL").show() rolledUpDF.where("Date IS NULL").show() #cubes dfNoNull.cube("Date", "Country").agg(_sum(col("Quantity")))\ .select("Date", "Country", "sum(Quantity)").orderBy("Date").show() ################### ## rdd and dataframes a = spark.range(10).rdd b = spark.range(10).toDF("id").rdd.map(lambda row: row[0])
from_json(col("value").cast("string"), stock_schema).alias("value") ) trade_df = value_df.select("value.*") \ .withColumn("CreatedTime", to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss"))\ .withColumn("Buy", expr("case when Type == 'BUY' then Amount else 0 end")) \ .withColumn("Sell", expr("case when Type == 'SELL' then Amount else 0 end")) # trade_df.printSchema() # water mark is made to set expiry time window_agg_df = trade_df \ .withWatermark("CreatedTime", "30 minute") \ .groupBy( window(col("CreatedTime"), "15 minute")) \ .agg(_sum("Buy").alias("TotalBuy"), _sum("Sell").alias("TotalSell")) # window_agg_df.printSchema() output_df = window_agg_df.select("window.start", "window.end", "TotalBuy", "TotalSell") """ # It will be used when want to perform bach processing running_total_window = Window.orderBy("end") \ .rowsBetween(Window.unboundedPreceding, Window.currentRow) final_output_df = output_df \ .withColumn("RTotalBuy", _sum("TotalBuy").over(running_total_window)) \ .withColumn("RTotalSell", _sum("TotalSell").over(running_total_window)) \ .withColumn("NetValue", expr("RTotalBuy - RTotalSell")) final_output_df.show(truncate=False)
def generate_cpu_eff_site( start_date=None, end_date=None, cms_type="production", output_folder="./www/cpu_eff", last_n_days=30, cpu_eff_outlier=0, ): """ """ _yesterday = datetime.combine(date.today() - timedelta(days=1), datetime.min.time()) if not (start_date or end_date): # defaults to the last 30 days with 3 days offset. # Default: (today-33days to today-3days) end_date = _yesterday start_date = end_date - timedelta(days=last_n_days) elif not start_date: start_date = end_date - timedelta(days=last_n_days) elif not end_date: end_date = min(start_date + timedelta(days=last_n_days), _yesterday) if start_date > end_date: raise ValueError( f"start date ({start_date}) should be earlier than end date({end_date})" ) group_type_map = { "production": ["Workflow", "WMAgent_RequestName"], # Order is important "analysis": ["Workflow"], "test": ["Workflow"], "folding@home": ["Workflow"], } # Should be a list, used also in dataframe merge conditions. group_by_col = group_type_map[cms_type] spark = get_spark_session() schema = _get_schema() raw_df = (spark.read.option("basePath", _DEFAULT_HDFS_FOLDER).json( get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER), schema=schema, ).select("data.*").filter(f"""Status='Completed' AND JobFailed=0 AND RecordTime >= {start_date.timestamp() * 1000} AND RecordTime < {end_date.timestamp() * 1000} AND Type = '{cms_type}' AND CpuEffOutlier = '{cpu_eff_outlier}' """).drop_duplicates(["GlobalJobId"])) raw_df = (raw_df.withColumn( "RequestCpus", when(col("RequestCpus").isNotNull(), col("RequestCpus")).otherwise(lit(1)), ).withColumn("CoreTime", col("WallClockHr") * col("RequestCpus")).withColumn( "Wasted_cputimehr", ((col("RequestCpus") * col("WallClockHr")) - col("CpuTimeHr")))).cache() grouped_tiers = raw_df.groupby("Tier", "Type", "CpuEffOutlier").agg( (100 * _sum("CpuTimeHr") / _sum("CoreTime")).alias("tier_cpueff"), _sum("RequestCpus").alias("tier_cpus"), _sum("CpuTimeHr").alias("tier_cputimehr"), _sum("WallClockHr").alias("tier_wallclockhr"), ).toPandas() grouped_wf = raw_df.groupby(*group_by_col, "Type").agg( (100 * _sum("CpuTimeHr") / _sum("CoreTime")).alias("wf_cpueff"), _sum("RequestCpus").alias("wf_cpus"), _sum("CpuTimeHr").alias("wf_cputimehr"), _sum("WallClockHr").alias("wf_wallclockhr"), _sum("Wasted_cputimehr").alias("wf_wasted_cputimehr"), ) grouped_wf_t1_t2 = raw_df.filter("""Tier='T1' OR Tier='T2'""").groupby( *group_by_col, "Type").agg( (100 * _sum("CpuTimeHr") / _sum("CoreTime")).alias("wf_cpueff_t1_t2"), _sum("CpuTimeHr").alias("wf_cputimehr_t1_t2"), _sum("WallClockHr").alias("wf_wallclockhr_t1_t2"), _sum("Wasted_cputimehr").alias("wf_wasted_cputimehr_t1_t2"), ) grouped_site_wf = raw_df.groupby(*group_by_col, "Site").agg( (100 * _sum("CpuTimeHr") / _sum("CoreTime")).alias("wf_site_cpueff"), _sum("RequestCpus").alias("wf_cpus"), _sum("CpuTimeHr").alias("wf_site_cputimehr"), _sum("WallClockHr").alias("wf_site_wallclockhr"), _sum("Wasted_cputimehr").alias("wf_site_wasted_cputimehr"), first("ScheddName").alias("schedd"), first("WMAgent_JobID").alias("wmagent_jobid"), ) select_expr = f"""wf_wallclockhr > 100""" selected_df = grouped_wf.where(select_expr) selected_pd = selected_df.toPandas() grouped_wf_t1_t2 = grouped_wf_t1_t2.toPandas() grouped_wf_t1_t2.drop(['Type'], axis=1, inplace=True) # Merge grouped_wf and grouped_wf_t1_t2 to see cpueff, cputimehr and wallclockhr values of (T1-T2 sites only) selected_pd = pd.merge(selected_pd, grouped_wf_t1_t2, how='left', left_on=group_by_col, right_on=group_by_col) workflow_column = 
selected_pd["Workflow"].copy() filter_column = (workflow_column if group_by_col[-1] == "Workflow" else selected_pd[group_by_col[-1]].copy()) main_page = _generate_main_page(selected_pd, grouped_tiers, start_date, end_date, cms_type, workflow_column, filter_column, cpu_eff_outlier) os.makedirs(output_folder, exist_ok=True) with open(f"{output_folder}/CPU_Efficiency_Table.html", "w") as ofile: ofile.write(main_page) # We are only interested on the selected workflows. site_wf = grouped_site_wf.where( col(filter_column.name).isin(filter_column.to_list())).toPandas() if cms_type == "production": site_klinks = site_kibana_links() site_wf["log"] = ( "<a href='https://cms-unified.web.cern.ch/cms-unified/logmapping/" + site_wf["WMAgent_RequestName"] + "/" + site_wf["schedd"] + "_" + site_wf["wmagent_jobid"] + ".tar.gz'>logs</a>") site_wf.drop(columns="schedd") site_wf["@Kibana"] = (site_klinks[0].format( START_DAY=(start_date + timedelta(seconds=time.altzone) ).strftime('%Y-%m-%dT%H:%M:%S.000Z'), END_DAY=(end_date + timedelta(seconds=time.altzone) ).strftime('%Y-%m-%dT%H:%M:%S.000Z')) + str(cpu_eff_outlier) + site_klinks[1] + site_wf["WMAgent_RequestName"] + site_klinks[2] + site_wf["Workflow"] + site_klinks[3] + site_wf["Site"] + site_klinks[4]) site_wf = site_wf.set_index([*group_by_col, "Site"]).sort_index() # Create one file per worflow, so we don't have a big file collapsing the browser. _folder = f"{output_folder}/wfbysite" os.makedirs(_folder, exist_ok=True) num_levels = len(group_by_col) for workflow, df in site_wf.groupby(filter_column.name): sublevels = "" if num_levels > 1: df_ni = df.reset_index() sublevels = ("/".join( df_ni[group_by_col[0:-1]].drop_duplicates().values[0].tolist()) + "/") os.makedirs(f"{_folder}/{sublevels}", exist_ok=True) df.droplevel(list(range(num_levels))).to_html( f"{_folder}/{sublevels}CPU_Efficiency_bySite_{workflow}.html", escape=False, )
def get_df_sub_not_read_since(df_dataset_file_rse_ts_size, filtered_rses_id_name_map, min_tb_limit, n_months_filter):
    """Get dataframe of datasets that have not been read for N months, used in the sub-detail htmls

    Groups get_df_dataset_file_rse_ts_size by 'dataset' and 'rse_id'.

    Filters:
        - If a dataset contains EVEN a single file with a null accessed_at, filter it out
        - Keep only datasets whose size in the RSE is greater than 'min_tb_limit' TB

    Access time filter logic:
        - If 'last_access_time_of_dataset_in_rse' is older than 'n_months_filter' months,
          set the 'is_not_read_since_{n_months_filter}_months' column to True

    Columns:
        - 'dataset_size_in_rse_tb'
            Total size of a dataset in an RSE, produced by summing up the dataset's files in that RSE.
        - 'last_access_time_of_dataset_in_rse'
            Last access time of a dataset in an RSE, produced by taking the max `accessed_at`
            (a single file's access time) over the dataset's files in that RSE.
        - '#_accessed_files_of_dataset_in_rse'
            Number of files of a dataset in an RSE with a non-NULL `accessed_at` value.
            This is important to know in order to filter out datasets that have any NULL `accessed_at` file.
        - '#_files_of_dataset_in_rse'
            Number of files of a dataset in an RSE

    Returns df_main_datasets_and_rses: RSE name, dataset, and their size and access time calculations
    """
    # New boolean column name marking dataset-rse_id couples that have not been read for at least n_months_filter months
    bool_column_is_not_read_since_n_months = 'is_not_read_since_{}_months'.format(
        str(n_months_filter))
    # Get reverted dict to get RSE names from ids
    reverted_filtered_rses_id_name_map = get_reverted_rses_id_name_map(
        filtered_rses_id_name_map)
    return df_dataset_file_rse_ts_size \
        .groupby(['rse_id', 'dataset']) \
        .agg(_round(_sum(col('f_size')) / TB_DENOMINATOR, 5).alias('dataset_size_in_rse_tb'),
             _max(col('accessed_at')).alias('last_access_time_of_dataset_in_rse'),
             _sum(
                 when(col('accessed_at').isNull(), 0).otherwise(1)
             ).alias('#_accessed_files_of_dataset_in_rse'),
             _count(lit(1)).alias('#_files_of_dataset_in_rse'),
             ) \
        .withColumn(bool_column_is_not_read_since_n_months,
                    when(
                        col('last_access_time_of_dataset_in_rse') < get_n_months_ago_epoch_msec(n_months_filter),
                        True).otherwise(False)
                    ) \
        .filter(col('last_access_time_of_dataset_in_rse').isNotNull()) \
        .filter(col(bool_column_is_not_read_since_n_months)) \
        .filter(col('dataset_size_in_rse_tb') > min_tb_limit) \
        .replace(reverted_filtered_rses_id_name_map, subset=['rse_id']) \
        .withColumnRenamed('rse_id', 'RSE name') \
        .select(['RSE name', 'dataset', 'dataset_size_in_rse_tb', 'last_access_time_of_dataset_in_rse',
                 '#_files_of_dataset_in_rse', '#_accessed_files_of_dataset_in_rse',
                 ]) \
        .cache()
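# get_n_months_ago_epoch_msec() is used above but not defined in this excerpt. A hedged sketch
# of what it might do, based on its name: the epoch timestamp, in milliseconds, of N months ago
# (months approximated as 30 days).
from datetime import datetime, timedelta

def get_n_months_ago_epoch_msec(n_months_ago):
    """Hypothetical helper: epoch milliseconds of `n_months_ago` months before now."""
    return int((datetime.utcnow() - timedelta(days=30 * n_months_ago)).timestamp() * 1000)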
def prepare_vxworks_data( file_tree_dataframe: DataFrame, vxworks_safety_features_dataframe: DataFrame) -> DataFrame: # Upcoming # # interrupt_stack_protection will need to consider whether or not the overflow / underflow # values are set to something non-zero in order for the protection to be valid # Currently that metadata doesn't exist # # user_stack_protection will need to check whether the overflow / underflow values # are non-zero, as well as the global_stack fill. For now, that metadata is either shaky # or does not exist. # # kernel_stack_protection will need to be considered. Currently this metadata # doesn't really exist, so the output will be rough. # # in write_protection, the vector table protection is _not_ a cross-platform feature, # and as such, computing statistics on it would be complex; we're going to avoid that for now vxworks_features_counted = vxworks_safety_features_dataframe.select( col('file_hash'), when( col('password_protection') == True, 1 ).otherwise(0).alias('password_protection_count'), when( col('interrupt_stack_protection.guard_zones') == True, 1 ).otherwise(0).alias('interrupt_stack_protection_count'), # If they have at least one of these two, we're going to give # them good boy points. when( (col('write_protection.user_text') == False) & \ (col('write_protection.kernel_text') == False), 0 # col('write_protection.virtual_mem_text') == True, 1 ).otherwise(1).alias('write_protection_count_preliminary'), col('write_protection.virtual_mem_text').alias('virtual_mem_text'), when( col('kernel_stack_protection.guard_overflow_size_exec').isNotNull() & \ col('kernel_stack_protection.guard_underflow_size_exec').isNotNull() & \ col('kernel_stack_protection.guard_overflow_size_exception').isNotNull(), 1 ).otherwise(None).alias('kernel_stack_protection_count'), when( (col('user_task_stack_protection.no_exec') == True) & \ (col('user_task_stack_protection.guard_zones') == True), 1 ).otherwise(0).alias('user_task_stack_protection_count'), when( col('file_hash').isNotNull(), 1 ).otherwise(0).alias('row_count') ).withColumn( # Nested filter: If user text protection and kernel text protection are # disabled but virtual mem text is enabled, then this facet is secure # Otherwise, we should call it insecure 'write_protection_count', when( (col('write_protection_count_preliminary') == 0) & \ (col('virtual_mem_text') == True), 1 ).otherwise( when( col('write_protection_count_preliminary') == 1, 1 ).otherwise(0) ) ).select( 'file_hash', 'row_count', 'write_protection_count', 'password_protection_count', 'kernel_stack_protection_count', 'interrupt_stack_protection_count', 'user_task_stack_protection_count', ).withColumn( 'has_one_security_feature', when( (col('write_protection_count') != 0) | \ (col('password_protection_count') != 0) | \ (col('kernel_stack_protection_count') != 0) | \ (col('interrupt_stack_protection_count') != 0) | \ (col('user_task_stack_protection_count') != 0), 1 ).otherwise(0) ) vxworks_safety_stats_df = vxworks_features_counted.join( file_tree_dataframe, 'file_hash').groupby('firmware_hash').agg( _sum('row_count').cast('int').alias('total_vxworks_count'), _sum('write_protection_count').cast('int').alias( 'write_protection'), _sum('password_protection_count').cast('int').alias( 'password_protection'), _sum('has_one_security_feature').cast('int').alias( 'count_with_security_features'), _sum('kernel_stack_protection_count').cast('int').alias( 'kernel_stack_protection'), _sum('interrupt_stack_protection_count').cast('int').alias( 'interrupt_stack_protection'), 
_sum('user_task_stack_protection_count').cast('int').alias( 'user_task_stack_protection'), ).select('firmware_hash', 'write_protection', 'password_protection', 'total_vxworks_count', 'kernel_stack_protection', 'interrupt_stack_protection', 'user_task_stack_protection', 'count_with_security_features') return vxworks_safety_stats_df
""" repartition based on 'LNAME' and 'Address' and generate spark_partiion_id then run mapPartitions() function and create in-partition idx """ df1 = df.repartition(N, 'LNAME', 'Address') \ .rdd.mapPartitionsWithIndex(func) \ .toDF() # get number of unique rows (based on Address+LNAME) which is max_idx # and then grab the running SUM of this rcnt # the new df should be small and just cache it w1 = Window.partitionBy().orderBy('partition_id').rowsBetween( Window.unboundedPreceding, -1) df2 = df1.groupby('partition_id') \ .agg((_max('idx')).alias('cnt')) \ .withColumn('rcnt', coalesce(_sum('cnt').over(w1),lit(0))) \ .cache() df2.show() #+------------+---+----+ #|partition_id|cnt|rcnt| #+------------+---+----+ #| 0| 3| 0| #| 1| 1| 3| #| 2| 1| 4| #| 4| 1| 5| #+------------+---+----+ """join df1 with df2 and create id = idx + rcnt""" df_new = df1.join(df2, on=['partition_id']).withColumn('id', col('idx') + col('rcnt'))
def prepare_vxworks_features_per_binary( file_tree_dataframe: DataFrame, vxworks_safety_features_dataframe: DataFrame) -> DataFrame: return vxworks_safety_features_dataframe.select( col('file_hash'), when( col('password_protection') == True, 1 ).otherwise(0).alias('password_protection_count'), when( col('interrupt_stack_protection.guard_zones') == True, 1 ).otherwise(0).alias('interrupt_stack_protection_count'), # If they have at least one of these two, we're going to give # them good boy points. when( (col('write_protection.user_text') == False) & \ (col('write_protection.kernel_text') == False), 0 # col('write_protection.virtual_mem_text') == True, 1 ).otherwise(1).alias('write_protection_count_preliminary'), col('write_protection.virtual_mem_text').alias('virtual_mem_text'), when( col('kernel_stack_protection.guard_overflow_size_exec').isNotNull() & \ col('kernel_stack_protection.guard_underflow_size_exec').isNotNull() & \ col('kernel_stack_protection.guard_overflow_size_exception').isNotNull(), 1 ).otherwise(None).alias('kernel_stack_protection_count'), when( (col('user_task_stack_protection.no_exec') == True) & \ (col('user_task_stack_protection.guard_zones') == True), 1 ).otherwise(0).alias('user_task_stack_protection_count') ).withColumn( # Nested filter: If user text protection and kernel text protection are # disabled but virtual mem text is enabled, then this facet is secure # Otherwise, we should call it insecure 'write_protection_count', when( (col('write_protection_count_preliminary') == 0) & \ (col('virtual_mem_text') == True), 1 ).otherwise( when( col('write_protection_count_preliminary') == 1, 1 ).otherwise(0) ) ).withColumn( 'ratio', ( col('write_protection_count') + \ col('password_protection_count') + \ when( col('kernel_stack_protection_count').isNotNull(), 1 ).otherwise(0) + \ col('interrupt_stack_protection_count') + \ col('user_task_stack_protection_count') ) / 5 ).select( 'file_hash', 'ratio' ).join( file_tree_dataframe, 'file_hash' ).groupBy( 'firmware_hash' ).agg( _sum( 'ratio' ).alias( 'vxworks_ratio_total' ), count( 'file_hash' ).alias('vxworks_binary_count') )
"userId", "buyId").pivot("price").sum("price") # In[48]: buyclicks_pivotPrice.orderBy("userId").show(15) # In[49]: # ORDERED table by USER, SESSION, TIME, ITEM buyclicks_raw.orderBy("userId", "userSessionId", "timestamp", "buyId").show(15) # In[50]: # TOTAL SPENT BY USER buyclicks_total_by_user = buyclicks_raw.groupBy("userId").agg( _sum("price").alias("totalspent")).orderBy("totalspent", ascending=False) # In[51]: buyclicks_total_by_user.show(5) # In[ ]: # ## AD CLICKS # In[52]: adclicks_raw.count() # In[53]:
def prepare_crypto_material_data( file_tree_dataframe: DataFrame, crypto_material_dataframe: DataFrame) -> DataFrame: # yapf: disable vulnerable_crypto_materials_stats_dataframe = crypto_material_dataframe.select( when( col('material_type') == 'SshRsaPrivateKeyBlock', 1 ).otherwise(0).alias('has_ssh_rsa_private_key'), when( col('material_type') == 'SshRsaPublicKeyBlock', 1 ).otherwise(0).alias('has_ssh_rsa_public_key'), when( col('material_type') == 'PgpPrivateKeyBlock', 1 ).otherwise(0).alias('has_pgp_private_key'), when( col('material_type') == 'Pkcs8PrivateKey', 1 ).otherwise(0).alias('has_pkcs8_private_key'), when( col('material_type') == 'Pkcs12Certificate', 1 ).otherwise(0).alias('has_pkcs12_certificate'), when( col('material_type') == 'SSLPrivateKey', 1 ).otherwise(0).alias('has_ssl_private_key'), 'file_hash' ) vulnerable_crypto_materials_counts = vulnerable_crypto_materials_stats_dataframe.join( file_tree_dataframe, 'file_hash' ).groupBy( 'firmware_hash' ).agg( _sum('has_ssh_rsa_private_key').cast('int').alias('ssh_rsa_private_key_count'), _sum('has_ssh_rsa_public_key').cast('int').alias('ssh_rsa_public_key_count'), _sum('has_pgp_private_key').cast('int').alias('pgp_private_key_count'), _sum('has_pkcs8_private_key').cast('int').alias('pkcs8_private_key_count'), _sum('has_pkcs12_certificate').cast('int').alias('pkcs12_certificate_count'), _sum('has_ssl_private_key').cast('int').alias('ssl_private_key_count'), ).select( 'firmware_hash', 'ssh_rsa_private_key_count', 'ssh_rsa_public_key_count', 'pgp_private_key_count', 'pkcs8_private_key_count', 'pkcs12_certificate_count', 'ssl_private_key_count' ) # Checking col('file_full_path').contains('ssh_host') before the full regex lets Spark filter most of the # rows out without having to uncompress the data from Tungsten to apply the regex, which results in a significant # speedup. host_and_authorized_key_counts = file_tree_dataframe.select( 'file_hash', 'firmware_hash', 'file_full_path' ).distinct().select( 'firmware_hash', when( (col('file_full_path').contains('ssh_host') & col('file_full_path').rlike('.*ssh_host.*key')), 1 ).otherwise(0).alias('has_host_key'), when( col('file_full_path').endswith('authorized_keys') | col('file_full_path').endswith('authorized_keys2'), 1 ).otherwise(0).alias('has_authorized_key'), ).groupBy( 'firmware_hash' ).agg( _sum('has_host_key').cast('int').alias('host_keys_count'), _sum('has_authorized_key').cast('int').alias('authorized_keys_count') ).select( 'firmware_hash', 'host_keys_count', 'authorized_keys_count' ) crypto_materials_stats_df = vulnerable_crypto_materials_counts.join( host_and_authorized_key_counts, 'firmware_hash' ) # yapf: enable return crypto_materials_stats_df
def _demographics_transform(self): """ Class method to transform and aggregate demographics data, grouping by state and calculating gender and race ratios for each state. Returns: [dict] - object with source-name: SparkDF key-value pairs """ df = self.data_dict.get('demographics', None) if df is not None: data = df \ .groupBy( col("state_code").alias("state_code"), col("state") ).agg( _sum("total_population").alias("total_population"), _sum("male_population").alias("male_population"), _sum("female_population").alias("female_population"), _sum("American Indian and Alaska Native").alias("american_indian_and_alaska_native"), _sum("Asian").alias("asian"), _sum("Black or African-American").alias("black_or_african_american"), _sum("Hispanic or Latino").alias("hispanic_or_latino"), _sum("White").alias("white") ) \ .withColumn( "male_population_ratio", round( (col("male_population") / col("total_population")), 2 ) ) \ .withColumn( "female_population_ratio", round( (col("female_population") / col("total_population")), 2 ) ) \ .withColumn( "american_indian_and_alaska_native_ratio", round( (col("american_indian_and_alaska_native") / col("total_population")), 2 ) ) \ .withColumn( "asian_ratio", round( (col("asian") / col("total_population")), 2 ) ) \ .withColumn( "black_or_african_american_ratio", round( (col("black_or_african_american") / col("total_population")), 2 ) ) \ .withColumn( "hispanic_or_latino_ratio", round( (col("hispanic_or_latino") / col("total_population")), 2 ) ) \ .withColumn( "white_ratio", round( (col("white") / col("total_population")), 2 ) ) return dict(demographics=data) else: logger.error(ValueError('No dataset named "demographics" found in cleaned data dict.')) raise ValueError('No dataset named "demographics" found in cleaned data dict.')
def prepare_code_analysis_data(file_tree_dataframe: DataFrame, code_analysis_python_dataframe: DataFrame) -> DataFrame: return code_analysis_python_dataframe.groupBy( 'file_hash' ).agg( _sum( when( (col('issue_severity') == 'HIGH') & (col('issue_confidence') == 'HIGH'), 1 ).otherwise(0) ).alias('hs_hc'), _sum( when( (col('issue_severity') == 'HIGH') & (col('issue_confidence') == 'MEDIUM'), 1 ).otherwise(0) ).alias('hs_mc'), _sum( when( (col('issue_severity') == 'HIGH') & (col('issue_confidence') == 'LOW'), 1 ).otherwise(0) ).alias('hs_lc'), _sum( when( (col('issue_severity') == 'MEDIUM') & (col('issue_confidence') == 'HIGH'), 1 ).otherwise(0) ).alias('ms_hc'), _sum( when( (col('issue_severity') == 'MEDIUM') & (col('issue_confidence') == 'MEDIUM'), 1 ).otherwise(0) ).alias('ms_mc'), _sum( when( (col('issue_severity') == 'MEDIUM') & (col('issue_confidence') == 'LOW'), 1 ).otherwise(0) ).alias('ms_lc'), _sum( when( (col('issue_severity') == 'LOW') & (col('issue_confidence') == 'HIGH'), 1 ).otherwise(0) ).alias('ls_hc'), _sum( when( (col('issue_severity') == 'LOW') & (col('issue_confidence') == 'MEDIUM'), 1 ).otherwise(0) ).alias('ls_mc'), _sum( when( (col('issue_severity') == 'LOW') & (col('issue_confidence') == 'LOW'), 1 ).otherwise(0) ).alias('ls_lc'), ).withColumn( 'weighted_file_risk', (col('hs_hc') * 10) + (col('hs_mc') * 5) + (col('hs_lc') * 2) + (col('ms_hc') * 5) + (col('ms_mc') * 2.5) + (col('ms_lc') * 1) + (col('ls_hc') * 2) + (col('ls_mc') * 1) + (col('ls_lc') * 0.4) ).join( file_tree_dataframe, 'file_hash' ).groupBy( 'firmware_hash' ).agg( _sum( 'weighted_file_risk' ).cast('float').alias('total_weighted_file_risk'), _sum( 'hs_hc' ).cast('int').alias('high_severity_high_confidence'), _sum( 'hs_mc' ).cast('int').alias('high_severity_medium_confidence'), _sum( 'hs_lc' ).cast('int').alias('high_severity_low_confidence'), _sum( 'ms_hc' ).cast('int').alias('medium_severity_high_confidence'), _sum( 'ms_mc' ).cast('int').alias('medium_severity_medium_confidence'), _sum( 'ms_lc' ).cast('int').alias('medium_severity_low_confidence'), _sum( 'ls_hc' ).cast('int').alias('low_severity_high_confidence'), _sum( 'ls_mc' ).cast('int').alias('low_severity_medium_confidence'), _sum( 'ls_lc' ).cast('int').alias('low_severity_low_confidence') )
def useSpark(sourceFile: str, targetTsvFile: str) -> None:
    """[Process the input source files using Spark to transform to target data]

    Args:
        sourceFile (str): [Path to the location of the input data]
        targetTsvFile (str): [Path to the location of the target data]
    """
    # secrets for access to the postgres database are held in a .env file;
    # this loads them into the application environment
    load_dotenv(verbose=True)

    spark = SparkSession.builder \
        .appName('Aquis2') \
        .master("local[2]") \
        .config(conf=getSparkConf(getJars())) \
        .getOrCreate()

    # clean data from source file
    cleanDf = spark.read.text(sourceFile) \
        .filter(col("value").contains("msgType_") & ~col("value").contains('msgType_":11')) \
        .withColumn("value", expr("substring(value,2)")) \
        .withColumn("value", regexp_replace("value", r'\{\{', r'\{"header":\{')) \
        .withColumn("value", regexp_replace("value", 'SELL,', '"SELL",')) \
        .withColumn("value", regexp_replace("value", 'BUY,', '"BUY",')) \
        .withColumn("value", regexp_replace("value", r'"flags_":"\{"', r'"flags_":\{"'))

    # figure out schema on message 8, keep for re-use later as a technology demonstration
    msg8Schema = spark.read.json(
        cleanDf.filter(col("value").contains('"msgType_":8')).select(
            col("value").cast("string")).rdd.map(
            lambda r: r.value))._jdf.schema().toDDL()
    msg8Df = cleanDf.filter(col("value").contains('"msgType_":8')) \
        .withColumn("value", from_json("value", msg8Schema)) \
        .select("value.security_.securityId_", "value.security_.isin_", "value.security_.currency_") \
        .repartition(2, ["securityId_"])
    # msg8Df.printSchema()
    # root
    # |-- securityId_: long (nullable = true)
    # |-- isin_: string (nullable = true)
    # |-- currency_: string (nullable = true)

    # figure out schema on message 12, keep for re-use later as a technology demonstration
    msg12Schema = spark.read.json(
        cleanDf.filter(col("value").contains('"msgType_":12')).select(
            col("value").cast("string")).rdd.map(
            lambda r: r.value))._jdf.schema().toDDL()
    msg12Df = cleanDf.filter(col("value").contains('"msgType_":12')) \
        .withColumn("value", from_json("value", msg12Schema)) \
        .repartition(2, ["value.bookEntry_.securityId_"])
    # msg12Df.printSchema()
    # msg12Df.select("value.bookEntry_.side_").show()
    # root
    # |-- value: struct (nullable = true)
    # |    |-- bookEntry_: struct (nullable = true)
    # |    |    |-- orderId_: long (nullable = true)
    # |    |    |-- price_: long (nullable = true)
    # |    |    |-- quantity_: long (nullable = true)
    # |    |    |-- securityId_: long (nullable = true)
    # |    |    |-- side_: string (nullable = true)
    # |    |-- header: struct (nullable = true)
    # |    |    |-- length_: long (nullable = true)
    # |    |    |-- msgType_: long (nullable = true)
    # |    |    |-- seqNo_: long (nullable = true)

    # now aggregate messageType12 SELL entries by securityId_
    aggDfSells = msg12Df.filter("value.bookEntry_.side_ == 'SELL'") \
        .select("*", (col("value.bookEntry_.quantity_") * col("value.bookEntry_.price_")).alias("TotalSellAmount")) \
        .groupby("value.bookEntry_.securityId_") \
        .agg(count("value.bookEntry_.securityId_").alias("Total Sell Count"),
             _sum("value.bookEntry_.quantity_").alias("Total Sell Quantity"),
             _min("value.bookEntry_.price_").alias("Min Sell Price"),
             _sum("TotalSellAmount").alias("Weighted Average Sell Price")) \
        .withColumn("Weighted Average Sell Price", col("Weighted Average Sell Price") / col("Total Sell Quantity"))

    # now aggregate messageType12 BUY entries by securityId_
    aggDfBuys = msg12Df.filter("value.bookEntry_.side_ == 'BUY'") \
        .select("*", (col("value.bookEntry_.quantity_") * col("value.bookEntry_.price_")).alias("TotalBuyAmount")) \
        .groupby("value.bookEntry_.securityId_") \
        .agg(count("value.bookEntry_.securityId_").alias("Total Buy Count"),
             _sum("value.bookEntry_.quantity_").alias("Total Buy Quantity"),
             _max("value.bookEntry_.price_").alias("Max Buy Price"),
             _sum("TotalBuyAmount").alias("Weighted Average Buy Price")) \
        .withColumn("Weighted Average Buy Price", col("Weighted Average Buy Price") / col("Total Buy Quantity"))

    # bring it together with joins; use an outer join with the security data due to missing ids.
    # select columns in the following order..
    outputColList = [col("isin_").alias("ISIN"), col("currency_").alias("Currency"),
                     "Total Buy Count", "Total Sell Count",
                     "Total Buy Quantity", "Total Sell Quantity",
                     "Weighted Average Buy Price", "Weighted Average Sell Price",
                     "Max Buy Price", "Min Sell Price"]

    outputDf = aggDfBuys.join(aggDfSells, ["securityId_"], "full_outer") \
        .join(msg8Df, ["securityId_"], "left_outer") \
        .na.fill(0, outputColList[2:]) \
        .na.fill("MISSING", ["isin_", "currency_"]) \
        .select(outputColList)

    # collect into a single file
    outputDf.coalesce(1).write.option("sep", "\t").csv(targetTsvFile, header=True)

    # Demo writing to postgresql (msg8 dataframe).
    # Will append records to table AcquisExample; the table will
    # be created on the fly if it does not exist.
    dburl = getDbConnectionUrl(db=os.getenv("POSTGRES_DB"),
                               user=os.getenv("POSTGRES_USER"),
                               secret=os.getenv("POSTGRES_SECRET"))
    msg8Df.write.format("jdbc") \
        .option("url", dburl) \
        .option("dbtable", "AcquisExample") \
        .option("driver", "org.postgresql.Driver") \
        .save(mode="append")

    spark.stop()
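# useSpark() relies on a few helpers (getJars, getSparkConf, getDbConnectionUrl) that are
# defined elsewhere in this project and are not shown here. The sketch below reconstructs
# getDbConnectionUrl only, based on the keyword arguments used at the call site and the
# standard PostgreSQL JDBC URL format; the host/port defaults are assumptions, not the
# original values.
def getDbConnectionUrl(db: str, user: str, secret: str, host: str = "localhost", port: int = 5432) -> str:
    """Hypothetical helper: build a PostgreSQL JDBC URL with credentials as query parameters."""
    return f"jdbc:postgresql://{host}:{port}/{db}?user={user}&password={secret}"

# The .env file read by load_dotenv() would then contain entries such as (placeholder values):
# POSTGRES_DB=aquis
# POSTGRES_USER=postgres
# POSTGRES_SECRET=changeme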
def main(
    output_folder="./www/stepchain",
    start_date=None,
    end_date=None,
    last_n_days=15,
):
    """Get step data in WMArchive.

    Each record's step array contains multiple steps. The udf function returns each
    step as a separate row in a list, and flatMap flattens those lists so every step
    becomes an individual row in the dataframe.
    """
    # Borrowed logic from condor_cpu_efficiency
    _yesterday = datetime.combine(date.today() - timedelta(days=1), datetime.min.time())
    if not (start_date or end_date):
        end_date = _yesterday
        start_date = end_date - timedelta(days=last_n_days)
    elif not start_date:
        start_date = end_date - timedelta(days=last_n_days)
    elif not end_date:
        end_date = min(start_date + timedelta(days=last_n_days), _yesterday)
    if start_date > end_date:
        raise ValueError(
            f"start date ({start_date}) should be earlier than end date ({end_date})"
        )

    spark = get_spark_session()
    df_raw = spark.read.option("basePath", _DEFAULT_HDFS_FOLDER).json(
        get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)
    ) \
        .select(["data.*", "metadata.timestamp"]) \
        .filter(
            f"""data.meta_data.jobstate='success'
                AND data.meta_data.jobtype='Production'
                AND data.wmats >= {start_date.timestamp()}
                AND data.wmats < {end_date.timestamp()}
            """
        )
    df_rdd = df_raw.rdd.flatMap(lambda r: udf_step_extract(r))
    df = spark.createDataFrame(df_rdd, schema=get_schema()).dropDuplicates().where(
        _col("ncores").isNotNull()).cache()

    df_details = df.groupby(["task", "site", "step_name"]).agg(
        (100 * (_sum("jobCPU") / _mean("nthreads")) / _sum("jobTime")).alias("avg_cpueff"),
        _count(lit(1)).alias("#jobs"),
        _mean("steps_len").alias("#steps"),
        _mean("nthreads").alias("#nthreads"),
        _mean("ncores").alias("#ncores"),
        (_sum("jobCPU") / _count(lit(1))).alias("avg_jobCPU"),
        (_sum("jobTime") / _count(lit(1))).alias("avg_jobTime"),
        _collect_set("acquisitionEra").alias("acquisitionEra"),
    ).withColumn("avg_cpueff", _col("avg_cpueff").cast(IntegerType())).toPandas()

    df_task = df.groupby(["task"]).agg(
        (100 * (_sum("jobCPU") / _mean("nthreads")) / _sum("jobTime")).alias("avg_cpueff"),
        _count(lit(1)).alias("#jobs"),
        _mean("steps_len").alias("#steps"),
        _mean("nthreads").alias("#nthreads"),
        _mean("ncores").alias("#ncores"),
        (_sum("jobCPU") / _count(lit(1))).alias("avg_jobCPU"),
        (_sum("jobTime") / _count(lit(1))).alias("avg_jobTime"),
    ).withColumn("avg_cpueff", _col("avg_cpueff").cast(IntegerType())).toPandas()

    write_htmls(df_details, df_task, start_date, end_date, output_folder)
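# A minimal, hypothetical command-line entry point for main() above. The flag names simply
# mirror main()'s keyword parameters; the real script may be wired up with click or another
# argument parser, so treat this as a sketch only. It assumes `from datetime import datetime`
# at module level, which main() already requires.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="CPU efficiency report for WMArchive stepchain steps")
    parser.add_argument("--output_folder", default="./www/stepchain")
    parser.add_argument("--start_date", type=lambda s: datetime.strptime(s, "%Y-%m-%d"), default=None)
    parser.add_argument("--end_date", type=lambda s: datetime.strptime(s, "%Y-%m-%d"), default=None)
    parser.add_argument("--last_n_days", type=int, default=15)
    args = parser.parse_args()
    main(args.output_folder, args.start_date, args.end_date, args.last_n_days)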