Example #1
def fetch_classads_nanoaod(hdfs_path):
    """Fetch HDFS ClassAds records from a particular path"""
    # DBS dataset info
    csvreader = spark.read.format("com.databricks.spark.csv").option(
        "nullValue", "null").option("mode", "FAILFAST")
    # Path where the input files are
    basepath = "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current"
    # Load the DBS DATASETS table so the job's requested dataset name can be matched to a DBS record
    dbs_datasets = csvreader.schema(
        schemas.schema_datasets()).load(basepath + "/DATASETS")

    # Read input file
    jobreports = spark.read.json(hdfs_path)

    # The following regexps describe what is in the cache
    regexp1 = "/*/Run2016.*-03Feb2017.*/NANOAOD"
    regexp2 = "/*/RunIISummer16MiniAODv2-PUMoriond17_80X_.*/NANOAODSIM"
    regexp3 = "/*/.*-31Mar2018.*/NANOAOD"
    regexp4 = "/*/.*RunIIFall17NanoAODv2.*/NANOAODSIM"

    # Desired sites
    sites = ["T2_US_UCSD", "T2_US_Caltech", "T3_US_UCR"]

    df = (jobreports
            # Join the DBS dataset table with the job reports
            .join(dbs_datasets, col('data.DESIRED_CMSDataset')==col('d_dataset'))
            # Require datasets from cache
            .filter(
                 col('d_dataset').rlike(regexp1) |
                 col('d_dataset').rlike(regexp2) |
                 col('d_dataset').rlike(regexp3) |
                 col('d_dataset').rlike(regexp4)
             )
            # Require jobs running at UCSD, Caltech, or UCR
            .filter(col('data.CMSSite').isin(sites))
            # Require CMS jobs
            .filter(col('data.VO') == "cms")
            # Require analysis jobs
            .filter(col('data.Type') == 'analysis')
            # Require completed jobs
            .filter(col('data.Status') == 'Completed')
            # Exclude other utility CRAB jobs (keep only vanilla-universe jobs)
            .filter(col('data.JobUniverse') == 5)
            # Select columns to save
            .select(
                 col('data.CRAB_Workflow').alias('workflow_id'),
                 col('data.CRAB_Id').alias('crab_id'),
                 col('data.CRAB_Retry').alias('num_retries'),
                 col('data.ScheddName').alias('schedd_name'),
                 col('data.CRAB_UserHN').alias('user_hn'),
                 col('data.CoreHr').alias('walltime'),
                 col('data.CpuTimeHr').alias('cpu_time'),
                 col('data.ExitCode').alias('exit_code'),
                 col('data.RequestCpus').alias('num_cpus'),
                 col('data.ChirpCMSSWReadBytes').alias('read_bytes')
             )
        )

    print("[script] Fetched {}".format(hdfs_path))
    return df
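
A possible usage sketch (not part of the original example): the input path and output location below are illustrative, and the call assumes the same spark, schemas, and col bindings the function relies on.

# Illustrative paths, not taken from the original script
day_path = "/project/monitoring/archive/condor/raw/metric/2018/06/01/*.json.gz"
day_df = fetch_classads_nanoaod(day_path)
day_df.write.parquet("hdfs://analytix/user/<username>/nanoaod_classads/2018-06-01.parquet")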
Example #2
def __init__(self, out):
    self.out = out
    conf = SparkConf().setMaster("yarn").setAppName("CMS Working Set")
    sc = SparkContext(conf=conf)
    self.spark = SparkSession(sc)
    avroreader = self.spark.read.format("com.databricks.spark.avro")
    csvreader = self.spark.read.format("com.databricks.spark.csv").option(
        "nullValue", "null").option("mode", "FAILFAST")
    # Check that phedex_path exists on HDFS before loading, or assign it directly
    # (a sketch of such a check follows this example).
    phedex_path = ("/project/awg/cms/phedex/block-replicas-snapshots/csv/time=" +
                   (date.today() - timedelta(days=2)).strftime("%Y-%m-%d") +
                   "_*/part-m-00000")
    self.phedex_block_replicas = csvreader.schema(
        schemas.schema_phedex()).load(phedex_path)
    self.dbs_files = csvreader.schema(schemas.schema_files()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/new/FILES/part-m-00000")
    self.dbs_blocks = csvreader.schema(schemas.schema_blocks()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/new/BLOCKS/part-m-00000")
    self.dbs_datasets = csvreader.schema(schemas.schema_datasets()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/new/DATASETS/part-m-00000")
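
The existence check mentioned in the comment is not shown in the original. A minimal sketch using the Hadoop FileSystem API exposed through py4j (assuming an active SparkSession named spark) could look like this:

def hdfs_glob_exists(spark, pattern):
    """Return True if the HDFS glob pattern matches at least one path."""
    jvm = spark.sparkContext._jvm
    hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
    # globStatus returns None for a non-existent literal path and an empty array for an
    # unmatched glob, so both cases are treated as "does not exist".
    status = fs.globStatus(jvm.org.apache.hadoop.fs.Path(pattern))
    return status is not None and len(status) > 0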
Example #3
def fileMismatch(args):
    conf = SparkConf().setMaster("yarn").setAppName("CMS Working Set")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    print(
        "Initiated spark session on yarn, web URL: http://ithdp1101.cern.ch:8088/proxy/%s"
        % sc.applicationId)

    avroreader = spark.read.format("com.databricks.spark.avro")
    csvreader = spark.read.format("com.databricks.spark.csv").option(
        "nullValue", "null").option("mode", "FAILFAST")
    dbs_files = csvreader.schema(schemas.schema_files()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/FILES/part-m-00000")
    dbs_datasets = csvreader.schema(schemas.schema_datasets()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATASETS/part-m-00000")

    # Keep only files modified within the last `args.days` days; DBS stores
    # f_last_modification_date as an integer unix timestamp (seconds), so the cutoff is
    # truncated to its first 10 digits before the comparison.
    current = time.time()
    past_n_days = args.days
    delta_t = str(current - past_n_days * 60 * 60 * 24)[:10]

    if args.out_path:
        # Invalid files modified within the window, joined to their dataset and restricted to
        # the selected dataset access types; files last modified by two specific accounts are
        # excluded.
        mismatch_df = (
            dbs_files
            .filter(col('f_is_file_valid') == '0')
            .filter(col('f_last_modification_date') > delta_t)
            .join(dbs_datasets, col('f_dataset_id') == col('d_dataset_id'))
            .filter((col('d_dataset_access_type_id') == '1')
                    | (col('d_dataset_access_type_id') == '41'))
            .filter(col('f_logical_file_name').isNotNull())
            .where(~(dbs_files.f_last_modified_by.contains('dmielaik')
                     | dbs_files.f_last_modified_by.contains('ogarzonm')))
            .select('d_dataset', 'f_last_modified_by', 'f_logical_file_name')
            .distinct())
        # Write the per-file list as a single CSV and print a per-dataset summary
        (mismatch_df
            .select('f_logical_file_name', 'f_last_modified_by')
            .repartition(1)
            .write.format("com.databricks.spark.csv")
            .option("header", "true")
            .save(args.out_path))
        (mismatch_df
            .groupby('d_dataset')
            .agg(fn.count(fn.col('f_logical_file_name')).alias('extra_lfn_phedex'))
            .show())
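
The args object is consumed but never constructed in this snippet. A hedged sketch of how it might be built with argparse (the flag names are assumptions; the original CLI may differ):

import argparse

parser = argparse.ArgumentParser(
    description="List invalidated DBS files modified in the last N days")
parser.add_argument("--days", type=int, default=7, help="look-back window in days")
parser.add_argument("--out_path", help="HDFS path for the CSV output; nothing is written if omitted")
fileMismatch(parser.parse_args())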
Example #4

inputfile = "/project/monitoring/archive/condor/raw/metric/" + year + "/" + month + "/*/*.json.gz"
outputfile = "hdfs://analytix/user/ddavila/model/data_tier_days_" + year + month + ".parquet"

print("===========================================================")
print("reading: " + inputfile)
print("writing: " + outputfile)
print("===========================================================")

conf = SparkConf().setMaster("yarn").setAppName("CMS Working Set")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

# Get information from DBS about dataset IDs and data tiers
csvreader = spark.read.format("com.databricks.spark.csv").option(
    "nullValue", "null").option("mode", "FAILFAST")
dbs_datasets = csvreader.schema(schemas.schema_datasets()).load(
    "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATASETS/part-m-00000")
dbs_data_tiers = csvreader.schema(schemas.schema_data_tiers()).load(
    "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATA_TIERS/part-m-00000")

schema = types.StructType([
    types.StructField(
        "data",
        types.StructType([
            types.StructField("Status", types.StringType(), True),
            types.StructField("Type", types.StringType(), True),
            types.StructField("JobUniverse", types.StringType(), True),
            types.StructField("DESIRED_CMSDataset", types.StringType(), True),
            types.StructField("RecordTime", types.LongType(), True),
        ]), False),
])
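
The snippet above stops after defining the schema. A hedged sketch of how the rest of the script might proceed (this continuation is an assumption, not the original code, and it relies on col being imported from pyspark.sql.functions):

# Read the condor records with the explicit schema, keep completed analysis jobs,
# attach the DBS dataset record, and write the result to the parquet output path.
jobreports = spark.read.schema(schema).json(inputfile)
df = (jobreports
        .filter(col('data.Status') == 'Completed')
        .filter(col('data.Type') == 'analysis')
        .join(dbs_datasets, col('data.DESIRED_CMSDataset') == col('d_dataset')))
df.write.parquet(outputfile)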
Example #5
def run(args):
    conf = SparkConf().setMaster("yarn").setAppName("CMS Working Set")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    print(
        "Initiated spark session on yarn, web URL: http://ithdp1101.cern.ch:8088/proxy/%s"
        % sc.applicationId)

    csvreader = (spark.read.format("csv").option("nullValue", "null").option(
        "mode", "FAILFAST"))
    dbs_files = csvreader.schema(schemas.schema_files()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/FILES/part-m-00000")
    dbs_datasets = (
        csvreader.schema(schemas.schema_datasets())
        .load("/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATASETS/part-m-00000")
        # Derive the input campaign from the processed-dataset part of the dataset name
        .withColumn(
            "input_campaign",
            fn.regexp_extract(
                col("d_dataset"),
                r"^/[^/]*/((?:HI|PA|PN|XeXe|)Run201\d\w-[^-]+|CMSSW_\d+|[^-]+)[^/]*/",
                1,
            ),
        ))

    if args.source == "classads":
        working_set_day = (
            get_df_condor(spark, args.dates)
            .withColumn("day", (col("timestamp") - col("timestamp") % fn.lit(86400)))
            .join(dbs_datasets, col("dataset_name") == col("d_dataset"))
            .groupBy("day", "input_campaign", "d_data_tier_id")
            .agg(fn.collect_set("d_dataset_id").alias("working_set")))
        working_set_day.write.parquet(args.out)
    elif args.source == "cmssw":
        working_set_day = (
            get_df_cmssw(spark, args.dates)
            .withColumn("day", (col("timestamp") - col("timestamp") % fn.lit(86400)))
            .join(dbs_files, col("file_lfn") == col("f_logical_file_name"))
            .join(dbs_datasets, col("f_dataset_id") == col("d_dataset_id"))
            .groupBy("day", "input_campaign", "d_data_tier_id", "site_name", "is_crab")
            .agg(fn.collect_set("f_block_id").alias("working_set_blocks")))
        working_set_day.write.parquet(args.out)
    elif args.source == "xrootd":
        working_set_day = (
            get_df_xrootd(spark, args.dates)
            .withColumn("day", (col("timestamp") - col("timestamp") % fn.lit(86400)))
            .join(dbs_files, col("file_lfn") == col("f_logical_file_name"))
            .join(dbs_datasets, col("f_dataset_id") == col("d_dataset_id"))
            .groupBy("day", "input_campaign", "d_data_tier_id", "client_domain")
            .agg(fn.collect_set("f_block_id").alias("working_set_blocks")))
        working_set_day.write.parquet(args.out)
    elif args.source == "fwjr":
        working_set_day = (
            get_df_wmarchive(spark, args.dates)
            .withColumn("day", (col("timestamp") - col("timestamp") % fn.lit(86400)))
            .join(dbs_files, col("file_lfn") == col("f_logical_file_name"))
            .join(dbs_datasets, col("f_dataset_id") == col("d_dataset_id"))
            .groupBy("day", "input_campaign", "d_data_tier_id", "site_name")
            .agg(fn.collect_set("f_block_id").alias("working_set_blocks")))
        working_set_day.write.parquet(args.out)
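
A hedged sketch of a possible command-line entry point for run (the original argument parsing is not shown; flag names are assumptions):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build daily working-set aggregates")
    parser.add_argument("--source", choices=["classads", "cmssw", "xrootd", "fwjr"], required=True)
    parser.add_argument("--dates", help="date range to process, in the form expected by the get_df_* helpers")
    parser.add_argument("--out", required=True, help="output parquet path")
    run(parser.parse_args())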