def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, it's the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex tables
    tables = {}
    tables.update(
        phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex_df = tables['phedex_df']

    # register user defined functions
    unix2date = udf(unix2human, StringType())
    siteFilter = udf(site_filter, IntegerType())
    one_day = 60 * 60 * 24

    # aggregate phedex info into dataframe
    cols = [
        'node_name', 'dataset_name', 'block_bytes', 'replica_time_create',
        'br_user_group_id'
    ]
    pdf = phedex_df.select(cols).where(siteFilter(col('node_name')) == 1)\
            .groupBy(['node_name', 'dataset_name', 'replica_time_create', 'br_user_group_id'])\
            .agg({'block_bytes':'sum'})\
            .withColumn('date', lit(date))\
            .withColumn('replica_date', unix2date(col('replica_time_create')))\
            .withColumnRenamed('sum(block_bytes)', 'size')\
            .withColumnRenamed('dataset_name', 'dataset')\
            .withColumnRenamed('node_name', 'site')\
            .withColumnRenamed('br_user_group_id', 'groupid')
    pdf.registerTempTable('pdf')
    pdf.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines the area on HDFS;
    # it is either an absolute path or an area under /user/USERNAME
    if fout:
        year, month, day = split_date(date)
        out = '%s/%s/%s/%s' % (fout, year, month, day)
        cols = ['date', 'site', 'dataset', 'size', 'replica_date', 'groupid']
        # don't write a header: when we read the data back it will mismatch the
        # data types, i.e. headers are strings while rows may be different data types
        pdf.select(cols)\
            .write.format("com.databricks.spark.csv")\
            .option("header", "true").save(out)

    ctx.stop()
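# The two helpers registered as UDFs above (unix2human, site_filter) are imported
# from elsewhere in the package and are not defined in this section. Below is a
# minimal, hypothetical sketch of what they are assumed to do; the exact signatures
# and the site-selection rule are assumptions for illustration only.
import time

def unix2human(tstamp):
    "Convert a unix timestamp (seconds) into a YYYY-MM-DD string (assumed behavior)"
    return time.strftime('%Y-%m-%d', time.gmtime(float(tstamp)))

def site_filter(node_name):
    "Return 1 for site names we keep, 0 otherwise (the actual rule is an assumption)"
    return 1 if node_name and node_name.startswith('T') else 0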
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, it's the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    ddf = tables['ddf']
    fdf = tables['fdf']

    print("### ddf from main", ddf)

    # join tables
    cols = ['*'] # to select all fields from table
    cols = ['d_dataset_id', 'd_dataset', 'f_logical_file_name', 'f_adler32']
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    adler = [
        'ad8f6ad2', '9c441343', 'f68d5dca', '81c90e2a', '471d2524',
        'a3c1f077', '6f0018a0', '8bb03b60', 'd504882c', '5ede357f',
        'b05303c3', '716d1776', '7e9cf258', '1945804b', 'ec7bc1d7',
        '12c87747', '94f2aa32'
    ]
    cond = 'f_adler32 in %s' % adler
    cond = cond.replace('[', '(').replace(']', ')')
    # scols = ['f_logical_file_name']
    fjoin = joins.where(cond).distinct().select(cols)

    print_rows(fjoin, stmt, verbose)

    # write out results back to HDFS, the fout parameter defines the area on HDFS;
    # it is either an absolute path or an area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
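# print_rows is imported from a helper module and is not shown in this section.
# A minimal sketch of the assumed behavior (print the SQL statement and show a few
# rows when verbose is enabled); the real helper may differ.
def print_rows(df, stmt, verbose, num=5):
    "Illustrative sketch: print the statement and a few dataframe rows when verbose"
    if verbose:
        print("### SQL statement: %s" % stmt)
        df.show(num)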
def run(fout, date, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, it's the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}
    tables.update(
        phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex = tables['phedex_df']

    instances = [inst]
    if inst == 'all':
        instances = ['GLOBAL', 'PHYS01', 'PHYS02', 'PHYS03']
    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val
    daf = reduce(lambda a, b: a.unionAll(b), [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a, b: a.unionAll(b), [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a, b: a.unionAll(b), [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # Aggregate total DBS and PhEDEx size of each dataset
    # d_dataset_id, d_dataset, dataset_access_type
    dbs_df = ddf_df.join(daf_df, ddf_df.d_dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(ddf_df.d_dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .join(fdf_df, dbs_df.d_dataset_id == fdf_df.f_dataset_id)\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'dbs_size')\
                   .drop(dbs_df.d_dataset_id)\
                   .drop(fdf_df.f_dataset_id)\
                   .drop(dbs_df.dataset_access_type)

    # dataset, dbs_size
    dbs_df = dbs_df.groupBy(['dataset'])\
                   .agg({'dbs_size':'sum'})\
                   .withColumnRenamed('sum(dbs_size)', 'dbs_size')

    # dataset, site, phedex_size
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_df = phedex.select(phedex_cols)
    phedex_df = phedex_df.withColumnRenamed('block_bytes', 'phedex_size')\
                         .withColumnRenamed('dataset_name', 'dataset')\
                         .withColumnRenamed('node_name', 'site')

    # dataset, sites, phedex_size
    phedex_df = phedex_df.groupBy(['dataset'])\
                         .agg({'phedex_size':'sum', 'site': 'collect_set'})\
                         .withColumnRenamed('sum(phedex_size)', 'phedex_size')\
                         .withColumnRenamed('collect_set(site)', 'sites')

    # Subtract to get leftovers
    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])

    # dataset
    leftover_datasets_df = phedex_df.select('dataset').subtract(dbs_df.select('dataset'))

    # dataset, campaign, sites, phedex_size
    leftovers_df = leftover_datasets_df.select('dataset').join(phedex_df, 'dataset') #.join(dbs_df, 'dataset')
    leftovers_df = leftovers_df.withColumn('campaign', extract_campaign_udf(leftovers_df.dataset))\
                               .select(['dataset', 'campaign', 'sites', 'phedex_size'])

    # Subtract to get leftovers that don't even exist in DBS (orphans)
    ddf_datasets_df = ddf_df.withColumnRenamed('d_dataset', 'dataset').select('dataset')
    leftover_datasets_df = phedex_df.select('dataset').subtract(ddf_datasets_df)

    # dataset, campaign, sites, phedex_size
    leftovers_orphans_df = leftover_datasets_df.select('dataset').join(phedex_df, 'dataset') #.join(dbs_df, 'dataset')
    leftovers_orphans_df = leftovers_orphans_df.withColumn('campaign', extract_campaign_udf(leftovers_orphans_df.dataset))\
                                               .select(['dataset', 'campaign', 'sites', 'phedex_size'])

    # Sum total size of leftovers
    all_leftovers_size = leftovers_df.select('phedex_size').groupBy().sum().rdd.map(lambda x: x[0]).collect()[0]
    orphan_leftovers_size = leftovers_orphans_df.select('phedex_size').groupBy().sum().rdd.map(lambda x: x[0]).collect()[0]

    print('All leftovers PhEDEx size: %s' % bytes_to_readable(all_leftovers_size))
    print('Orphan leftovers PhEDEx size: %s' % bytes_to_readable(orphan_leftovers_size))

    # write out results back to HDFS, the fout parameter defines the area on HDFS;
    # it is either an absolute path or an area under /user/USERNAME
    if fout:
        leftovers_df.write.format("com.databricks.spark.csv")\
            .option("header", "true").save('%s/all' % fout)
        leftovers_orphans_df.write.format("com.databricks.spark.csv")\
            .option("header", "true").save('%s/orphans' % fout)

    ctx.stop()
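# bytes_to_readable is imported from a helper module and is not defined in this
# section. Below is a minimal, hypothetical sketch of such a conversion, used only
# to illustrate the printed summary above; the real helper may format differently.
def bytes_to_readable(size):
    "Illustrative sketch: convert a byte count into a human readable string"
    size = float(size or 0)
    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
        if size < 1024.0:
            return '%3.1f%s' % (size, unit)
        size /= 1024.0
    return '%3.1fEB' % size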
def run(fout, date, yarn=None, verbose=None, inst='GLOBAL', limit=100):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, it's the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}
    tables.update(phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex = tables['phedex_df']

    instances = ['GLOBAL'] # , 'PHYS01', 'PHYS02', 'PHYS03'
    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val
    daf = reduce(lambda a, b: a.unionAll(b), [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a, b: a.unionAll(b), [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a, b: a.unionAll(b), [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # d_dataset_id, d_dataset, dataset_access_type
    dbs_df = ddf_df.join(daf_df, ddf_df.d_dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(ddf_df.d_dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .join(fdf_df, dbs_df.d_dataset_id == fdf_df.f_dataset_id)\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'dbs_size')\
                   .drop(dbs_df.d_dataset_id)\
                   .drop(fdf_df.f_dataset_id)\
                   .drop(dbs_df.dataset_access_type)

    # dataset, dbs_size
    dbs_df = dbs_df.groupBy(['dataset'])\
                   .agg({'dbs_size':'sum'})\
                   .withColumnRenamed('sum(dbs_size)', 'dbs_size')

    # dataset_name, block_bytes, node_name
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_all_df = phedex.select(phedex_cols)

    aggregate(sqlContext, fout, phedex_all_df, dbs_df, limit)
    aggregate(sqlContext, fout, phedex_all_df, dbs_df, limit, disk_only=True)

    ctx.stop()
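# The aggregate() helper called above is defined elsewhere and not shown in this
# section. The sketch below only illustrates the assumed meaning of the disk_only
# switch, i.e. dropping tape/buffer/export endpoints before summing PhEDEx sizes;
# the real helper performs the full aggregation and write-out and may differ.
def aggregate(sqlContext, fout, phedex_df, dbs_df, limit, disk_only=False):
    "Illustrative sketch only: drop tape/buffer/export endpoints when disk_only is set"
    if disk_only:
        phedex_df = phedex_df\
            .where(~phedex_df.node_name.endswith('_MSS'))\
            .where(~phedex_df.node_name.endswith('_Buffer'))\
            .where(~phedex_df.node_name.endswith('_Export'))
    # the real helper would join with dbs_df, aggregate sizes, apply the limit
    # and write results under fout; that logic is omitted here
    return phedex_df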
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, it's the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    daf = tables['daf']
    ddf = tables['ddf']
    bdf = tables['bdf']
    fdf = tables['fdf']
    aef = tables['aef']
    pef = tables['pef']
    mcf = tables['mcf']
    ocf = tables['ocf']
    rvf = tables['rvf']

    print("### ddf from main", ddf)

    # aggregate phedex info into dataframe
    phedex_cols = [
        'node_name', 'dataset_name', 'dataset_is_open', 'block_bytes',
        'replica_time_create'
    ]
    newpdf = phedex_df.select(phedex_cols).groupBy(['node_name', 'dataset_name', 'dataset_is_open'])\
                      .agg({'block_bytes':'sum', 'replica_time_create':'max'})\
                      .withColumnRenamed('sum(block_bytes)', 'pbr_size')\
                      .withColumnRenamed('max(replica_time_create)', 'max_replica_time')
    newpdf.registerTempTable('newpdf')
    # print_rows(newpdf, 'newpdf', verbose)
    # newpdf.persist(StorageLevel.MEMORY_AND_DISK)

    # join tables
    cols = ['*'] # to select all fields from table
    cols = [
        'd_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
        'f_event_count', 'f_file_size', 'dataset_access_type',
        'acquisition_era_name', 'processing_version'
    ]
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)
    # print_rows(joins, 'joins', verbose)

    # keep joins table around
    # joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step we have the fjoin table with rows like
    # Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',
    #     d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0,
    #     dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')
    newdf = fjoin\
            .groupBy(['d_dataset','d_dataset_id','dataset_access_type','acquisition_era_name','processing_version'])\
            .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point we have the newdf dataframe with collected stats for every dataset,
    # let's join it with release info
    newdf.registerTempTable('newdf')
    cols = [
        'd_dataset_id', 'd_dataset', 'evts', 'size', 'date',
        'dataset_access_type', 'acquisition_era_name', 'processing_version',
        'r_release_version'
    ]
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')
    # print_rows(agg_dbs_df, 'agg_dbs_df', verbose)

    # keep agg_dbs_df table around
    # agg_dbs_df.persist(StorageLevel.MEMORY_AND_DISK)

    # join dbs and phedex tables
    # cols = ['d_dataset_id','d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','processing_version','r_release_version','dataset_name','node_name','pbr_size','dataset_is_open','max_replica_time']
    cols = [
        'd_dataset', 'evts', 'size', 'date', 'dataset_access_type',
        'acquisition_era_name', 'r_release_version', 'node_name', 'pbr_size',
        'dataset_is_open', 'max_replica_time'
    ]
    stmt = 'SELECT %s FROM agg_dbs_df JOIN newpdf ON agg_dbs_df.d_dataset = newpdf.dataset_name' % ','.join(cols)
    finaldf = sqlContext.sql(stmt)

    # keep finaldf table around
    # finaldf.persist(StorageLevel.MEMORY_AND_DISK)
    # print_rows(finaldf, stmt, verbose)

    # write out results back to HDFS, the fout parameter defines the area on HDFS;
    # it is either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(finaldf, 'd_dataset')
        ndf.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
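# split_dataset is imported from a helper module and is not defined in this section.
# A hypothetical sketch of the assumed behavior is given below: split the dataset
# path /primary/processed/tier into separate columns and drop the original one.
# The column names used here are assumptions; the real helper may differ.
from pyspark.sql.functions import split, col

def split_dataset(df, dcol):
    "Illustrative sketch: split a dataset column into primary/processed/tier parts"
    ndf = df.withColumn('primds', split(col(dcol), '/').getItem(1))\
            .withColumn('procds', split(col(dcol), '/').getItem(2))\
            .withColumn('tier', split(col(dcol), '/').getItem(3))\
            .drop(dcol)
    return ndf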
def run(fout, date, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, it's the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}
    tables.update(
        phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex = tables['phedex_df']

    instances = ['GLOBAL', 'PHYS01', 'PHYS02', 'PHYS03']
    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val
    daf = reduce(lambda a, b: a.unionAll(b), [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a, b: a.unionAll(b), [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a, b: a.unionAll(b), [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # Aggregate by campaign and find total PhEDEx and DBS size of each campaign
    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])

    # dataset, size, dataset_access_type_id
    dbs_df = fdf_df.join(ddf_df, fdf_df.f_dataset_id == ddf_df.d_dataset_id)\
                   .drop('f_dataset_id')\
                   .drop('d_dataset_id')\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'size')\
                   .withColumnRenamed('d_dataset_access_type_id', 'dataset_access_type_id')

    # dataset, size, dataset_access_type
    dbs_df = dbs_df.join(daf_df, dbs_df.dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(dbs_df.dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # campaign, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .withColumn('campaign', extract_campaign_udf(dbs_df.dataset))\
                   .groupBy(['campaign'])\
                   .agg({'size':'sum'})\
                   .withColumnRenamed('sum(size)', 'dbs_size')\
                   .drop('dataset')

    # campaign, phedex_size
    phedex_cols = ['dataset_name', 'block_bytes']
    phedex_df = phedex.select(phedex_cols)
    phedex_df = phedex_df.withColumn('campaign', extract_campaign_udf(phedex_df.dataset_name))\
                         .groupBy(['campaign'])\
                         .agg({'block_bytes':'sum'})\
                         .withColumnRenamed('sum(block_bytes)', 'phedex_size')

    # campaign, dbs_size, phedex_size
    dbs_phedex_df = dbs_df.join(phedex_df, dbs_df.campaign == phedex_df.campaign)\
                          .drop(dbs_df.campaign)

    print('DISTINCT DBS AND PHEDEX CAMPAIGN COUNT:')
    print(dbs_phedex_df.select('campaign').distinct().count())

    # Select campaign - site pairs and their sizes (from PhEDEx)
    # campaign, site, size
    phedex_cols = ['dataset_name', 'node_name', 'block_bytes']
    campaign_site_df = phedex.select(phedex_cols)
    campaign_site_df = campaign_site_df.withColumn('campaign', extract_campaign_udf(campaign_site_df.dataset_name))\
                                       .groupBy(['campaign', 'node_name'])\
                                       .agg({'block_bytes':'sum'})\
                                       .withColumnRenamed('sum(block_bytes)', 'size')\
                                       .withColumnRenamed('node_name', 'site')

    # Aggregate data for site - campaign count table
    # site, count
    site_campaign_count_df = campaign_site_df.groupBy(['site'])\
                                             .agg(countDistinct('campaign'))\
                                             .withColumnRenamed('count(campaign)', 'campaign_count')\
                                             .orderBy('campaign_count', ascending=False)\
                                             .limit(LIMIT)

    # Find two most significant sites for each campaign
    columns_before_pivot = campaign_site_df.columns
    result = campaign_site_df.groupBy(['campaign'])\
                             .pivot('site')\
                             .sum('size')\
                             .na.fill(0)
    columns_after_pivot = result.columns
    sites_columns = [x for x in columns_after_pivot if x not in columns_before_pivot]

    number_of_sites_udf = udf(lambda row: len([x for x in row if x != 0]), IntegerType())
    mss_udf = udf(get_mss, LongType())
    second_mss_udf = udf(get_second_mss, LongType())
    mss_name_udf = udf(lambda row: get_mss_name(row, sites_columns), StringType())
    second_mss_name_udf = udf(lambda row: get_second_mss_name(row, sites_columns), StringType())

    result = result.withColumn('sites', number_of_sites_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('mss', mss_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('mss_name', mss_name_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('second_mss', second_mss_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('second_mss_name', second_mss_name_udf(struct([result[x] for x in sites_columns])))

    # campaign, phedex_size, dbs_size, mss, mss_name, second_mss, second_mss_name, sites
    result = result.join(dbs_phedex_df, result.campaign == dbs_phedex_df.campaign)\
                   .drop(result.campaign)

    sorted_by_phedex = result.orderBy(result.phedex_size, ascending=False).limit(LIMIT)
    sorted_by_dbs = result.orderBy(result.dbs_size, ascending=False).limit(LIMIT)

    # write out results back to HDFS, the fout parameter defines the area on HDFS;
    # it is either an absolute path or an area under /user/USERNAME
    if fout:
        sorted_by_phedex.write.format("com.databricks.spark.csv")\
            .option("header", "true").save('%s/phedex' % fout)
        sorted_by_dbs.write.format("com.databricks.spark.csv")\
            .option("header", "true").save('%s/dbs' % fout)
        site_campaign_count_df.write.format("com.databricks.spark.csv")\
            .option("header", "true").save('%s/site_campaign_count' % fout)

    ctx.stop()
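# get_mss, get_second_mss, get_mss_name and get_second_mss_name are helpers defined
# elsewhere in the module. The sketch below shows the assumed idea -- pick the
# largest (and second largest) per-site size from the pivoted row -- and is for
# illustration only; get_second_mss_name would follow the same pattern.
def get_mss(row):
    "Illustrative sketch: size of the most significant site in the pivoted row"
    return int(max(row)) if row else 0

def get_second_mss(row):
    "Illustrative sketch: size of the second most significant site"
    values = sorted(row, reverse=True)
    return int(values[1]) if len(values) > 1 else 0

def get_mss_name(row, sites):
    "Illustrative sketch: name of the most significant site"
    return sites[list(row).index(max(row))] if row else ''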
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    print("Input arguments: %s" % opts)

    start_time = time.time()
    verbose = opts.verbose
    yarn = opts.yarn
    inst = opts.inst
    date = opts.date
    fout = opts.fout
    aaa_hdir = opts.aaa_hdir

    if inst.lower() in ['global', 'phys01', 'phys02', 'phys03']:
        inst = inst.upper()
    else:
        raise Exception('Unsupported DBS instance "%s"' % inst)

    # Create spark context
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)

    # Create SQL context to be used for SQL queries
    sql_context = SQLContext(ctx)

    # Initialize DBS tables
    dbs_tables(sql_context, inst=inst, verbose=verbose, tables=['fdf', 'bdf', 'ddf'])

    # Initialize PhEDEx table to be used in file_block_site table
    phedex_tables(sql_context, verbose=verbose)

    # Register clean_site_name to be used with SQL queries
    sql_context.udf.register("clean_site_name", clean_site_name)

    # Register tier_from_site_name to be used with SQL queries
    sql_context.udf.register("tier_from_site_name", tier_from_site_name)

    # Register dn2uuid to be used with SQL queries
    sql_context.udf.register("dn2uuid", dn2uuid)

    # Register parse_app to be used with SQL queries
    sql_context.udf.register("parse_app", parse_app)

    # Register stream4app to be used with SQL queries
    sql_context.udf.register("stream4app", stream4app)

    # Register parse_dn to be used with SQL queries
    sql_context.udf.register("parse_dn", parse_dn)

    f_b_s_start_time = time.time()
    # Create temp table with file name, block name, site name and site from PhEDEx
    create_file_block_site_table(ctx, sql_context, verbose)
    f_b_s_elapsed_time = elapsed_time(f_b_s_start_time)

    cmssw_start_time = time.time()
    aggregated_cmssw_df = run_agg_cmssw(date, ctx, sql_context, verbose)
    cmssw_elapsed_time = elapsed_time(cmssw_start_time)

    aaa_start_time = time.time()
    if len(aaa_hdir) > 0:
        aggregated_aaa_df = run_agg_aaa(date, ctx, sql_context, aaa_hdir, verbose)
    else:
        aggregated_aaa_df = run_agg_aaa(date, ctx, sql_context, verbose=verbose)
    aaa_elapsed_time = elapsed_time(aaa_start_time)

    eos_start_time = time.time()
    aggregated_eos_df = run_agg_eos(date, ctx, sql_context, verbose)
    eos_elapsed_time = elapsed_time(eos_start_time)

    jm_start_time = time.time()
    aggregated_jm_df = run_agg_jm(date, ctx, sql_context, verbose)
    jm_elapsed_time = elapsed_time(jm_start_time)

    if verbose:
        print('Will union outputs from all streams to a single dataframe')

    # Schema for output is:
    # site name, dataset name, number of accesses, distinct users, stream
    all_df = aggregated_cmssw_df.unionAll(aggregated_aaa_df)
    all_df = all_df.unionAll(aggregated_eos_df)
    all_df = all_df.unionAll(aggregated_jm_df)
    all_df = all_df.sort(desc("nacc"))

    if verbose:
        print('Done joining all outputs to a single dataframe')

    fout = fout + "/" + short_date_string(date)

    # output_dataframe(fout + "/Aggregated/CMSSW/" + short_date_string(date), aggregated_cmssw_df, verbose)
    # output_dataframe(fout + "/Aggregated/AAA/" + short_date_string(date), aggregated_aaa_df, verbose)
    # output_dataframe(fout + "/Aggregated/EOS/" + short_date_string(date), aggregated_eos_df, verbose)
    # output_dataframe(fout + "/Aggregated/JobMonitoring/" + short_date_string(date), aggregated_jm_df, verbose)

    output_dataframe(fout, all_df, verbose)

    if verbose:
        cmssw_df_size = aggregated_cmssw_df.count()
        aaa_df_size = aggregated_aaa_df.count()
        eos_df_size = aggregated_eos_df.count()
        jm_df_size = aggregated_jm_df.count()
        all_df_size = all_df.count()

        print('CMSSW:')
        aggregated_cmssw_df.show(10)
        aggregated_cmssw_df.printSchema()

        print('AAA:')
        aggregated_aaa_df.show(10)
        aggregated_aaa_df.printSchema()

        print('EOS:')
        aggregated_eos_df.show(10)
        aggregated_eos_df.printSchema()

        print('JobMonitoring:')
        aggregated_jm_df.show(10)
        aggregated_jm_df.printSchema()

        print('Aggregated all:')
        all_df.show(10)
        all_df.printSchema()

        print('Output record count:')
        print('Output record count CMSSW         : %s' % cmssw_df_size)
        print('Output record count AAA           : %s' % aaa_df_size)
        print('Output record count EOS           : %s' % eos_df_size)
        print('Output record count JobMonitoring : %s' % jm_df_size)
        print('Output record count Total         : %s' % all_df_size)

    ctx.stop()

    print('Start time                 : %s' % time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(start_time)))
    print('End time                   : %s' % time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(time.time())))
    print('Total elapsed time         : %s' % elapsed_time(start_time))
    print('FileBlockSite elapsed time : %s' % f_b_s_elapsed_time)
    print('AAA elapsed time           : %s' % aaa_elapsed_time)
    print('CMSSW elapsed time         : %s' % cmssw_elapsed_time)
    print('EOS elapsed time           : %s' % eos_elapsed_time)
    print('JobMonitoring elapsed time : %s' % jm_elapsed_time)
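# short_date_string and elapsed_time are small utilities imported from elsewhere in
# the package. The sketches below show plausible implementations for illustration
# only; the real helpers may format their output differently.
import time

def short_date_string(date):
    "Illustrative sketch: normalize a YYYYMMDD or YYYY/MM/DD date into YYYYMMDD"
    return str(date).replace('/', '').replace('-', '')

def elapsed_time(start_time):
    "Illustrative sketch: human readable elapsed time since start_time (epoch seconds)"
    seconds = int(time.time() - start_time)
    minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '%d:%02d:%02d' % (hours, minutes, secs)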
def run(fout, date, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, it's the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(
        phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex = tables['phedex_df']
    daf = tables['daf']
    ddf = tables['ddf']
    fdf = tables['fdf']

    # DBS
    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # dataset, size, dataset_access_type_id
    dbs_df = fdf_df.join(ddf_df, fdf_df.f_dataset_id == ddf_df.d_dataset_id)\
                   .drop('f_dataset_id')\
                   .drop('d_dataset_id')\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'size')\
                   .withColumnRenamed('d_dataset_access_type_id', 'dataset_access_type_id')

    # dataset, size, dataset_access_type
    dbs_df = dbs_df.join(daf_df, dbs_df.dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(dbs_df.dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .groupBy('dataset')\
                   .agg({'size':'sum'})\
                   .withColumnRenamed('sum(size)', 'dbs_size')

    # PhEDEx
    size_on_disk_udf = udf(lambda site, size: 0 if site.endswith(('_MSS', '_Buffer', '_Export')) else size)

    # dataset, size, site
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_df = phedex.select(phedex_cols)\
                      .withColumnRenamed('dataset_name', 'dataset')\
                      .withColumnRenamed('block_bytes', 'size')\
                      .withColumnRenamed('node_name', 'site')

    # dataset, phedex_size, size_on_disk
    phedex_df = phedex_df.withColumn('size_on_disk', size_on_disk_udf(phedex_df.site, phedex_df.size))\
                         .groupBy('dataset')\
                         .agg({'size':'sum', 'size_on_disk': 'sum'})\
                         .withColumnRenamed('sum(size)', 'phedex_size')\
                         .withColumnRenamed('sum(size_on_disk)', 'size_on_disk')

    # dataset, dbs_size, phedex_size, size_on_disk
    result = phedex_df.join(dbs_df, phedex_df.dataset == dbs_df.dataset)\
                      .drop(dbs_df.dataset)

    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])
    extract_tier_udf = udf(lambda dataset: dataset.split('/')[3])

    # campaign, tier, dbs_size, phedex_size, size_on_disk
    result = result.withColumn('campaign', extract_campaign_udf(result.dataset))\
                   .withColumn('tier', extract_tier_udf(result.dataset))\
                   .drop('dataset')\
                   .groupBy(['campaign', 'tier'])\
                   .agg({'dbs_size':'sum', 'phedex_size': 'sum', 'size_on_disk': 'sum'})\
                   .withColumnRenamed('sum(dbs_size)', 'dbs_size')\
                   .withColumnRenamed('sum(phedex_size)', 'phedex_size')\
                   .withColumnRenamed('sum(size_on_disk)', 'size_on_disk')

    # campaign, tier, dbs_size, phedex_size, size_on_disk
    result = result.withColumn('sum_size', result.dbs_size + result.phedex_size)
    result = result.orderBy(result.sum_size, ascending=False)\
                   .drop('sum_size')\
                   .limit(LIMIT)

    # write out results back to HDFS, the fout parameter defines the area on HDFS;
    # it is either an absolute path or an area under /user/USERNAME
    if fout:
        result.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
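# A minimal, hypothetical sketch of how one of the run() functions above could be
# wired to a command line entry point. The option names below mirror the function
# parameters, but the actual OptionParser class used in this package may define its
# options differently.
import argparse

def sketch_main():
    "Illustrative sketch of a CLI wrapper around run(fout, date, ...)"
    parser = argparse.ArgumentParser(description='Run the PySpark aggregation job')
    parser.add_argument('--fout', required=True, help='HDFS output area (absolute or under /user/USERNAME)')
    parser.add_argument('--date', required=True, help='date in YYYYMMDD format')
    parser.add_argument('--yarn', action='store_true', help='run on YARN instead of locally')
    parser.add_argument('--verbose', action='store_true', help='enable verbose output')
    parser.add_argument('--inst', default='GLOBAL', help='DBS instance: GLOBAL, PHYS01, PHYS02 or PHYS03')
    opts = parser.parse_args()
    run(opts.fout, opts.date, yarn=opts.yarn, verbose=opts.verbose, inst=opts.inst)

if __name__ == '__main__':
    sketch_main()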